Commit e683ee2a by Johannes Singler, committed by Johannes Singler

re PR libstdc++/33893 ([parallel mode] Algorithms rely on omp_set_dynamic(false))

2007-11-22  Johannes Singler  <singler@ira.uka.de>

        PR libstdc++/33893
        * include/parallel/multiway_merge.h: made omp_dynamic-safe
        * include/parallel/workstealing.h: made omp_dynamic-safe
        * include/parallel/base.h: infrastructure, cleanup
        * include/parallel/par_loop.h: made omp_dynamic-safe
        * include/parallel/features.h: activate loser tree variant
        * include/parallel/quicksort.h: made omp_dynamic-safe
        * include/parallel/compiletime_settings.h: settings overridable
        * include/parallel/equally_split.h: made omp_dynamic-safe
        * include/parallel/omp_loop_static.h: made omp_dynamic-safe
        * include/parallel/random_shuffle.h: made omp_dynamic-safe
        * include/parallel/balanced_quicksort.h: made omp_dynamic-safe
        * include/parallel/set_operations.h: made omp_dynamic-safe
        * include/parallel/unique_copy.h: made omp_dynamic-safe
        * include/parallel/multiway_mergesort.h: made omp_dynamic-safe
        * include/parallel/search.h: made omp_dynamic-safe
        * include/parallel/partition.h: made omp_dynamic-safe
        * include/parallel/partial_sum.h: made omp_dynamic-safe
        * include/parallel/find.h: made omp_dynamic-safe
        * include/parallel/omp_loop.h: made omp_dynamic-safe
        * include/parallel/losertree.h: avoid default constructor

From-SVN: r130347
......@@ -63,8 +63,8 @@
namespace __gnu_parallel
{
/** @brief Information local to one thread in the parallel quicksort run. */
template<typename RandomAccessIterator>
/** @brief Information local to one thread in the parallel quicksort run. */
template<typename RandomAccessIterator>
struct QSBThreadLocal
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
......@@ -94,29 +94,17 @@ namespace __gnu_parallel
QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { }
};
/** @brief Initialize the thread local storage.
* @param tls Array of thread-local storages.
* @param queue_size Size of the work-stealing queue. */
template<typename RandomAccessIterator>
inline void
qsb_initialize(QSBThreadLocal<RandomAccessIterator>** tls, int queue_size)
{
int iam = omp_get_thread_num();
tls[iam] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
}
/** @brief Balanced quicksort divide step.
/** @brief Balanced quicksort divide step.
* @param begin Begin iterator of subsequence.
* @param end End iterator of subsequence.
* @param comp Comparator.
* @param num_threads Number of threads that are allowed to work on
* this part.
* @pre @c (end-begin)>=1 */
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
qsb_divide(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp, int num_threads)
Comparator comp, thread_index_t num_threads)
{
_GLIBCXX_PARALLEL_ASSERT(num_threads > 0);
......@@ -124,13 +112,15 @@ namespace __gnu_parallel
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
RandomAccessIterator pivot_pos = median_of_three_iterators(begin, begin + (end - begin) / 2, end - 1, comp);
RandomAccessIterator pivot_pos = median_of_three_iterators(
begin, begin + (end - begin) / 2, end - 1, comp);
#if defined(_GLIBCXX_ASSERTIONS)
// Must be in between somewhere.
difference_type n = end - begin;
_GLIBCXX_PARALLEL_ASSERT((!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
_GLIBCXX_PARALLEL_ASSERT(
(!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
|| (!comp(*pivot_pos, *begin) && !comp(*end, *pivot_pos))
|| (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos))
|| (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*end, *pivot_pos))
......@@ -143,10 +133,12 @@ namespace __gnu_parallel
std::swap(*pivot_pos, *(end - 1));
pivot_pos = end - 1;
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
pred(comp, *pivot_pos);
// Divide, returning end - begin - 1 in the worst case.
difference_type split_pos = parallel_partition(begin, end - 1, pred, num_threads);
difference_type split_pos = parallel_partition(
begin, end - 1, pred, num_threads);
// Swap back pivot to middle.
std::swap(*(begin + split_pos), *pivot_pos);
......@@ -163,18 +155,21 @@ namespace __gnu_parallel
return split_pos;
}
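A minimal sequential sketch of the divide step above, assuming random-access iterators: it takes the middle element as pivot for brevity where the real code calls median_of_three_iterators, and uses std::partition where the real code calls parallel_partition. All names below are illustrative and not part of the patch.

#include <algorithm>
#include <iterator>

// Park the pivot at the end, partition the rest, swap the pivot into its
// final position, and return that position as an offset from begin.
template<typename It, typename Comp>
  typename std::iterator_traits<It>::difference_type
  sketch_qsb_divide(It begin, It end, Comp comp)
  {
    It pivot_pos = begin + (end - begin) / 2;   // real code: median of three
    std::iter_swap(pivot_pos, end - 1);         // keep the pivot out of the way
    pivot_pos = end - 1;
    It split = std::partition(begin, end - 1,
        [&](const typename std::iterator_traits<It>::value_type& x)
        { return comp(x, *pivot_pos); });       // real code: parallel_partition
    std::iter_swap(split, pivot_pos);           // pivot lands at split_pos
    return split - begin;                       // everything left of it is smaller
  }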
/** @brief Quicksort conquer step.
/** @brief Quicksort conquer step.
* @param tls Array of thread-local storages.
* @param begin Begin iterator of subsequence.
* @param end End iterator of subsequence.
* @param comp Comparator.
* @param iam Number of the thread processing this function.
* @param num_threads Number of threads that are allowed to work on this part. */
template<typename RandomAccessIterator, typename Comparator>
* @param num_threads
* Number of threads that are allowed to work on this part. */
template<typename RandomAccessIterator, typename Comparator>
inline void
qsb_conquer(QSBThreadLocal<RandomAccessIterator>** tls,
RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp, thread_index_t iam, thread_index_t num_threads)
Comparator comp,
thread_index_t iam, thread_index_t num_threads,
bool parent_wait)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -182,12 +177,12 @@ namespace __gnu_parallel
difference_type n = end - begin;
if (num_threads <= 1 || n < 2)
if (num_threads <= 1 || n <= 1)
{
tls[iam]->initial.first = begin;
tls[iam]->initial.second = end;
qsb_local_sort_with_helping(tls, comp, iam);
qsb_local_sort_with_helping(tls, comp, iam, parent_wait);
return;
}
......@@ -199,33 +194,55 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin));
#endif
thread_index_t num_threads_leftside = std::max<thread_index_t>(1, std::min<thread_index_t>(num_threads - 1, split_pos * num_threads / n));
thread_index_t num_threads_leftside =
std::max<thread_index_t>(1, std::min<thread_index_t>(
num_threads - 1, split_pos * num_threads / n));
#pragma omp atomic
# pragma omp atomic
*tls[iam]->elements_leftover -= (difference_type)1;
// Conquer step.
#pragma omp parallel sections num_threads(2)
# pragma omp parallel num_threads(2)
{
bool wait;
if(omp_get_num_threads() < 2)
wait = false;
else
wait = parent_wait;
# pragma omp sections
{
#pragma omp section
qsb_conquer(tls, begin, begin + split_pos, comp, iam, num_threads_leftside);
# pragma omp section
{
qsb_conquer(tls, begin, begin + split_pos, comp,
iam,
num_threads_leftside,
wait);
wait = parent_wait;
}
// The pivot_pos is left in place, to ensure termination.
#pragma omp section
# pragma omp section
{
qsb_conquer(tls, begin + split_pos + 1, end, comp,
iam + num_threads_leftside, num_threads - num_threads_leftside);
iam + num_threads_leftside,
num_threads - num_threads_leftside,
wait);
wait = parent_wait;
}
}
}
}
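The heart of the omp_dynamic fix is visible above: the conquer step still requests two threads for the nested region, but it now asks omp_get_num_threads() whether nesting was actually granted and only keeps waiting for stolen work if it was. The thread split itself stays proportional to the split position; with n = 1000 elements, split_pos = 300 and num_threads = 8 (illustrative values), split_pos * num_threads / n = 2, so two threads recurse into the left part and six into the right. A standalone sketch of the nesting check, with made-up names:

#include <omp.h>
#include <cstdio>

// Request nested threads, then check how many were actually granted before
// relying on them; this is what keeps the code correct under omp_dynamic.
void conquer_sketch(bool parent_wait)
{
#pragma omp parallel num_threads(2)
  {
    // Private per thread: only wait for stolen work if real nesting happened.
    bool wait = (omp_get_num_threads() < 2) ? false : parent_wait;
#pragma omp sections
    {
#pragma omp section
      std::printf("left part, wait = %d\n", (int)wait);
#pragma omp section
      std::printf("right part, wait = %d\n", (int)wait);
    }
  }
}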
/**
/**
* @brief Quicksort step doing load-balanced local sort.
* @param tls Array of thread-local storages.
* @param comp Comparator.
* @param iam Number of the thread processing this function.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline void
qsb_local_sort_with_helping(QSBThreadLocal<RandomAccessIterator>** tls,
Comparator& comp, int iam)
Comparator& comp, int iam, bool wait)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -265,7 +282,9 @@ namespace __gnu_parallel
std::swap(*pivot_pos, *(end - 1));
pivot_pos = end - 1;
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
__gnu_parallel::binder2nd
<Comparator, value_type, value_type, bool>
pred(comp, *pivot_pos);
// Divide, leave pivot unchanged in last place.
RandomAccessIterator split_pos1, split_pos2;
......@@ -286,16 +305,19 @@ namespace __gnu_parallel
{
// Very unequal split, one part smaller than one 128th of the total;
// the predicate below matches elements not strictly larger than the pivot.
__gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos));
__gnu_parallel::unary_negate<__gnu_parallel::binder1st
<Comparator, value_type, value_type, bool>, value_type>
pred(__gnu_parallel::binder1st
<Comparator, value_type, value_type, bool>(
comp, *pivot_pos));
// Find other end of pivot-equal range.
split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred);
split_pos2 = __gnu_sequential::partition(
split_pos1 + 1, end, pred);
}
else
{
// Only skip the pivot.
split_pos2 = split_pos1 + 1;
}
// Elements equal to pivot are done.
elements_done += (split_pos2 - split_pos1);
......@@ -317,7 +339,8 @@ namespace __gnu_parallel
{
// Left side larger.
if (begin != split_pos1)
tl.leftover_parts.push_front(std::make_pair(begin, split_pos1));
tl.leftover_parts.push_front(
std::make_pair(begin, split_pos1));
current.first = split_pos2;
//current.second = end; //already set anyway
......@@ -336,8 +359,9 @@ namespace __gnu_parallel
if (tl.leftover_parts.pop_front(current))
continue;
#pragma omp atomic
# pragma omp atomic
*tl.elements_leftover -= elements_done;
elements_done = 0;
#if _GLIBCXX_ASSERTIONS
......@@ -345,8 +369,8 @@ namespace __gnu_parallel
#endif
// Look for new work.
bool success = false;
while (*tl.elements_leftover > 0 && !success
bool successfully_stolen = false;
while (wait && *tl.elements_leftover > 0 && !successfully_stolen
#if _GLIBCXX_ASSERTIONS
// Possible dead-lock.
&& (omp_get_wtime() < (search_start + 1.0))
......@@ -357,11 +381,12 @@ namespace __gnu_parallel
victim = rng(num_threads);
// Large pieces.
success = (victim != iam) && tls[victim]->leftover_parts.pop_back(current);
if (!success)
successfully_stolen = (victim != iam)
&& tls[victim]->leftover_parts.pop_back(current);
if (!successfully_stolen)
yield();
#if !defined(__ICC) && !defined(__ECC)
#pragma omp flush
# pragma omp flush
#endif
}
......@@ -369,10 +394,11 @@ namespace __gnu_parallel
if (omp_get_wtime() >= (search_start + 1.0))
{
sleep(1);
_GLIBCXX_PARALLEL_ASSERT(omp_get_wtime() < (search_start + 1.0));
_GLIBCXX_PARALLEL_ASSERT(
omp_get_wtime() < (search_start + 1.0));
}
#endif
if (!success)
if (!successfully_stolen)
{
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0);
......@@ -383,7 +409,7 @@ namespace __gnu_parallel
}
}
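When a thread's own deque runs dry, the loop above repeatedly picks a random victim and tries to steal the largest leftover piece from the back of that victim's queue, yielding between attempts; the wait flag added by this patch keeps a thread from spinning for work that no sibling thread can ever produce. A conceptual sketch of the stealing loop, with a hypothetical thread-safe deque type standing in for the library's queue:

// Conceptual sketch only: 'Queue' is any thread-safe deque offering pop_back(),
// and rng(k) returns a random index in [0, k); neither is the library's real type.
template<typename Queue, typename Piece, typename RandomNumber>
  bool
  steal_piece(Queue** queues, volatile long* elements_leftover,
              int num_threads, int iam, RandomNumber& rng, Piece& current)
  {
    bool stolen = false;
    while (*elements_leftover > 0 && !stolen)
      {
        int victim = rng(num_threads);                  // pick a random victim
        stolen = (victim != iam)                        // never steal from yourself
                 && queues[victim]->pop_back(current);  // take the largest piece
        // The real code yields and issues an OpenMP flush between attempts.
      }
    return stolen;
  }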
/** @brief Top-level quicksort routine.
/** @brief Top-level quicksort routine.
* @param begin Begin iterator of sequence.
* @param end End iterator of sequence.
* @param comp Comparator.
......@@ -391,11 +417,13 @@ namespace __gnu_parallel
* @param num_threads Number of threads that are allowed to work on
* this part.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
typename std::iterator_traits<RandomAccessIterator>
::difference_type n,
thread_index_t num_threads)
{
_GLIBCXX_CALL(end - begin)
......@@ -413,11 +441,11 @@ namespace __gnu_parallel
if (num_threads > n)
num_threads = static_cast<thread_index_t>(n);
// Initialize thread local storage
tls_type** tls = new tls_type*[num_threads];
#pragma omp parallel num_threads(num_threads)
// Initialize variables per processor.
qsb_initialize(tls, num_threads * (thread_index_t)(log2(n) + 1));
difference_type queue_size = num_threads * (thread_index_t)(log2(n) + 1);
for (thread_index_t t = 0; t < num_threads; ++t)
tls[t] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
// There can never be more than ceil(log2(n)) ranges on the stack, because
// 1. Only one processor pushes onto the stack
......@@ -434,14 +462,8 @@ namespace __gnu_parallel
tls[i]->initial = std::make_pair(end, end);
}
// Initial splitting, recursively.
int old_nested = omp_get_nested();
omp_set_nested(true);
// Main recursion call.
qsb_conquer(tls, begin, begin + n, comp, 0, num_threads);
omp_set_nested(old_nested);
qsb_conquer(tls, begin, begin + n, comp, 0, num_threads, true);
#if _GLIBCXX_ASSERTIONS
// All stack must be empty.
......
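The per-thread work-stealing queues above are sized so they can never overflow: a single thread holds at most about log2(n) pending ranges, because each leftover piece is at most half of the one it was split from, and there are num_threads potential producers. A worked example with illustrative numbers:

#include <cmath>
#include <cassert>

int main()
{
  // Illustrative values only: n = 1,000,000 elements, 8 threads.
  long n = 1000000;
  int num_threads = 8;
  long per_thread = (long)(std::log2((double)n) + 1);  // floor(log2(1e6) + 1) = 20
  long queue_size = num_threads * per_thread;          // 160 entries per queue
  assert(per_thread == 20 && queue_size == 160);
  return 0;
}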
......@@ -49,11 +49,11 @@ namespace __gnu_parallel
// XXX remove std::duplicates from here if possible,
// XXX but keep minimal dependencies.
/** @brief Calculates the rounded-down logarithm of @c n for base 2.
/** @brief Calculates the rounded-down logarithm of @c n for base 2.
* @param n Argument.
* @return Returns 0 for argument 0.
*/
template<typename Size>
template<typename Size>
inline Size
log2(Size n)
{
......@@ -63,7 +63,7 @@ namespace __gnu_parallel
return k;
}
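A small check of the contract stated above (rounded-down base-2 logarithm, returning 0 for argument 0), using an equivalent shift-based sketch rather than the library's own loop:

#include <cassert>

// Returns floor(log2(n)) for n >= 1, and 0 for n == 0.
template<typename Size>
  inline Size
  sketch_log2(Size n)
  {
    Size k = 0;
    for (n >>= 1; n > 0; n >>= 1)
      ++k;
    return k;
  }

int main()
{
  assert(sketch_log2(1u) == 0u);
  assert(sketch_log2(5u) == 2u);      // floor(log2(5)) = 2
  assert(sketch_log2(1024u) == 10u);
  assert(sketch_log2(0u) == 0u);      // special case documented above
  return 0;
}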
/** @brief Encode two integers into one __gnu_parallel::lcas_t.
/** @brief Encode two integers into one __gnu_parallel::lcas_t.
* @param a First integer, to be encoded in the most-significant @c
* lcas_t_bits/2 bits.
* @param b Second integer, to be encoded in the least-significant
......@@ -71,13 +71,13 @@ namespace __gnu_parallel
* @return __gnu_parallel::lcas_t value encoding @c a and @c b.
* @see decode2
*/
inline lcas_t
encode2(int a, int b) //must all be non-negative, actually
{
inline lcas_t
encode2(int a, int b) //must all be non-negative, actually
{
return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0);
}
}
/** @brief Decode two integers from one __gnu_parallel::lcas_t.
/** @brief Decode two integers from one __gnu_parallel::lcas_t.
* @param x __gnu_parallel::lcas_t to decode integers from.
* @param a First integer, to be decoded from the most-significant
* @c lcas_t_bits/2 bits of @c x.
......@@ -85,18 +85,34 @@ namespace __gnu_parallel
* @c lcas_t_bits/2 bits of @c x.
* @see encode2
*/
inline void
decode2(lcas_t x, int& a, int& b)
{
inline void
decode2(lcas_t x, int& a, int& b)
{
a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask);
b = (int)((x >> 0 ) & lcas_t_mask);
}
}
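Since encode2/decode2 pack two non-negative counters into one word for a combined compare-and-swap, a quick round-trip check makes the bit layout concrete. The lcas_t stand-in, bit width and mask below are assumptions made for the example; the real definitions live elsewhere in the library.

#include <cassert>
#include <cstdint>

int main()
{
  // 64-bit stand-ins for lcas_t, lcas_t_bits and lcas_t_mask (assumed here).
  typedef std::uint64_t lcas_demo_t;
  const int bits = 64;
  const lcas_demo_t mask = ((lcas_demo_t)1 << (bits / 2)) - 1;

  int a = 3, b = 40000;
  lcas_demo_t x = (((lcas_demo_t)a) << (bits / 2)) | (lcas_demo_t)b;  // encode2
  int a2 = (int)((x >> (bits / 2)) & mask);                           // decode2, high half
  int b2 = (int)(x & mask);                                           // decode2, low half
  assert(a2 == 3 && b2 == 40000);   // both values survive the round trip
  return 0;
}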
/** @brief Equivalent to std::min. */
template<typename T>
const T&
min(const T& a, const T& b)
{
return (a < b) ? a : b;
};
/** @brief Equivalent to std::max. */
template<typename T>
const T&
max(const T& a, const T& b)
{
return (a > b) ? a : b;
};
/** @brief Constructs predicate for equality from strict weak
/** @brief Constructs predicate for equality from strict weak
* ordering predicate
*/
// XXX comparator at the end, as per others
template<typename Comparator, typename T1, typename T2>
// XXX comparator at the end, as per others
template<typename Comparator, typename T1, typename T2>
class equal_from_less : public std::binary_function<T1, T2, bool>
{
private:
......@@ -112,8 +128,9 @@ namespace __gnu_parallel
};
/** @brief Similar to std::binder1st, but giving the argument types explicitly. */
template<typename _Predicate, typename argument_type>
/** @brief Similar to std::binder1st,
* but giving the argument types explicitly. */
template<typename _Predicate, typename argument_type>
class unary_negate
: public std::unary_function<argument_type, bool>
{
......@@ -129,8 +146,13 @@ namespace __gnu_parallel
{ return !_M_pred(__x); }
};
/** @brief Similar to std::binder1st, but giving the argument types explicitly. */
template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
/** @brief Similar to std::binder1st,
* but giving the argument types explicitly. */
template<
typename _Operation,
typename first_argument_type,
typename second_argument_type,
typename result_type>
class binder1st
: public std::unary_function<second_argument_type, result_type>
{
......@@ -154,11 +176,15 @@ namespace __gnu_parallel
{ return op(value, __x); }
};
/**
/**
* @brief Similar to std::binder2nd, but giving the argument types
* explicitly.
*/
template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
template<
typename _Operation,
typename first_argument_type,
typename second_argument_type,
typename result_type>
class binder2nd
: public std::unary_function<first_argument_type, result_type>
{
......@@ -182,16 +208,16 @@ namespace __gnu_parallel
{ return op(__x, value); }
};
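The explicit-type binders above turn a binary comparator into a unary predicate even when the comparator does not provide the argument_type typedefs std::binder2nd would need; the quicksort divide step uses binder2nd to fix the pivot as the second argument. A usage sketch with a simplified stand-in class (names are illustrative):

#include <cassert>
#include <functional>

// Simplified stand-in for __gnu_parallel::binder2nd: fixes the second
// argument of a binary operation, yielding a unary predicate.
template<typename Op, typename Arg1, typename Arg2, typename Result>
struct binder2nd_demo
{
  Op op;
  Arg2 value;
  binder2nd_demo(const Op& o, const Arg2& v) : op(o), value(v) { }
  Result operator()(const Arg1& x) const { return op(x, value); }
};

int main()
{
  // pred(x) is true iff x < 42, i.e. x belongs to the left partition.
  binder2nd_demo<std::less<int>, int, int, bool> pred(std::less<int>(), 42);
  assert(pred(7));
  assert(!pred(99));
  return 0;
}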
/** @brief Similar to std::equal_to, but allows two different types. */
template<typename T1, typename T2>
/** @brief Similar to std::equal_to, but allows two different types. */
template<typename T1, typename T2>
struct equal_to : std::binary_function<T1, T2, bool>
{
bool operator()(const T1& t1, const T2& t2) const
{ return t1 == t2; }
};
/** @brief Similar to std::less, but allows two different types. */
template<typename T1, typename T2>
/** @brief Similar to std::less, but allows two different types. */
template<typename T1, typename T2>
struct less : std::binary_function<T1, T2, bool>
{
bool
......@@ -203,9 +229,9 @@ namespace __gnu_parallel
{ return t2 < t1; }
};
// Partial specialization for one type. Same as std::less.
template<typename _Tp>
struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
// Partial specialization for one type. Same as std::less.
template<typename _Tp>
struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
{
bool
operator()(const _Tp& __x, const _Tp& __y) const
......@@ -214,21 +240,23 @@ namespace __gnu_parallel
/** @brief Similar to std::plus, but allows two different types. */
template<typename _Tp1, typename _Tp2>
template<typename _Tp1, typename _Tp2>
struct plus : public std::binary_function<_Tp1, _Tp2, _Tp1>
{
typedef typeof(*static_cast<_Tp1*>(NULL) + *static_cast<_Tp2*>(NULL)) result;
typedef typeof(*static_cast<_Tp1*>(NULL)
+ *static_cast<_Tp2*>(NULL)) result;
result
operator()(const _Tp1& __x, const _Tp2& __y) const
{ return __x + __y; }
};
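The typeof(...) used above is the GCC extension that predates C++11; it lets plus deduce the result type of mixing two different operand types. A modern sketch of the same deduction, for comparison only:

#include <utility>

// decltype-based equivalent of the typeof result deduction above;
// e.g. plus_demo<int, double>::result is double.
template<typename Tp1, typename Tp2>
struct plus_demo
{
  typedef decltype(std::declval<Tp1>() + std::declval<Tp2>()) result;

  result
  operator()(const Tp1& x, const Tp2& y) const
  { return x + y; }
};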
// Partial specialization for one type. Same as std::plus.
template<typename _Tp>
// Partial specialization for one type. Same as std::plus.
template<typename _Tp>
struct plus<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
{
typedef typeof(*static_cast<_Tp*>(NULL) + *static_cast<_Tp*>(NULL)) result;
typedef typeof(*static_cast<_Tp*>(NULL)
+ *static_cast<_Tp*>(NULL)) result;
result
operator()(const _Tp& __x, const _Tp& __y) const
......@@ -236,22 +264,24 @@ namespace __gnu_parallel
};
/** @brief Similar to std::multiplies, but allows two different types. */
template<typename _Tp1, typename _Tp2>
/** @brief Similar to std::multiplies, but allows two different types. */
template<typename _Tp1, typename _Tp2>
struct multiplies : public std::binary_function<_Tp1, _Tp2, _Tp1>
{
typedef typeof(*static_cast<_Tp1*>(NULL) * *static_cast<_Tp2*>(NULL)) result;
typedef typeof(*static_cast<_Tp1*>(NULL)
* *static_cast<_Tp2*>(NULL)) result;
result
operator()(const _Tp1& __x, const _Tp2& __y) const
{ return __x * __y; }
};
// Partial specialization for one type. Same as std::multiplies.
template<typename _Tp>
// Partial specialization for one type. Same as std::multiplies.
template<typename _Tp>
struct multiplies<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
{
typedef typeof(*static_cast<_Tp*>(NULL) * *static_cast<_Tp*>(NULL)) result;
typedef typeof(*static_cast<_Tp*>(NULL)
* *static_cast<_Tp*>(NULL)) result;
result
operator()(const _Tp& __x, const _Tp& __y) const
......@@ -259,15 +289,15 @@ namespace __gnu_parallel
};
template<typename T, typename _DifferenceTp>
template<typename T, typename _DifferenceTp>
class pseudo_sequence;
/** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
/** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
* It features the usual random-access iterator functionality.
* @param T Sequence value type.
* @param difference_type Sequence difference type.
*/
template<typename T, typename _DifferenceTp>
template<typename T, typename _DifferenceTp>
class pseudo_sequence_iterator
{
public:
......@@ -317,13 +347,13 @@ namespace __gnu_parallel
{ return pos - i2.pos; }
};
/** @brief Sequence that conceptually consists of multiple copies of
/** @brief Sequence that conceptually consists of multiple copies of
the same element.
* The copies are not stored explicitly, of course.
* @param T Sequence value type.
* @param difference_type Sequence difference type.
*/
template<typename T, typename _DifferenceTp>
template<typename T, typename _DifferenceTp>
class pseudo_sequence
{
typedef pseudo_sequence<T, _DifferenceTp> type;
......@@ -356,23 +386,23 @@ namespace __gnu_parallel
difference_type count;
};
/** @brief Functor that does nothing */
template<typename _ValueTp>
/** @brief Functor that does nothing */
template<typename _ValueTp>
class void_functor
{
inline void
operator()(const _ValueTp& v) const { }
};
/** @brief Compute the median of three referenced elements,
/** @brief Compute the median of three referenced elements,
according to @c comp.
* @param a First iterator.
* @param b Second iterator.
* @param c Third iterator.
* @param comp Comparator.
*/
template<typename RandomAccessIterator, typename Comparator>
RandomAccessIterator
template<typename RandomAccessIterator, typename Comparator>
RandomAccessIterator
median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b,
RandomAccessIterator c, Comparator& comp)
{
......@@ -397,19 +427,19 @@ namespace __gnu_parallel
}
}
// Avoid the use of assert, because we're trying to keep the <cassert>
// include out of the mix. (Same as debug mode).
inline void
__replacement_assert(const char* __file, int __line,
// Avoid the use of assert, because we're trying to keep the <cassert>
// include out of the mix. (Same as debug mode).
inline void
__replacement_assert(const char* __file, int __line,
const char* __function, const char* __condition)
{
{
std::printf("%s:%d: %s: Assertion '%s' failed.\n", __file, __line,
__function, __condition);
__builtin_abort();
}
}
#define _GLIBCXX_PARALLEL_ASSERT(_Condition) \
do \
do \
{ \
if (!(_Condition)) \
__gnu_parallel::__replacement_assert(__FILE__, __LINE__, \
......@@ -419,4 +449,3 @@ namespace __gnu_parallel
} //namespace __gnu_parallel
#endif
......@@ -39,7 +39,7 @@
#include <cstdio>
/** @brief Determine verbosity level of the parallel mode.
* Level 1 prints a message each time when entering a parallel-mode function. */
* Level 1 prints a message each time a parallel-mode function is entered. */
#define _GLIBCXX_VERBOSE_LEVEL 0
/** @def _GLIBCXX_CALL
......@@ -50,27 +50,40 @@
#define _GLIBCXX_CALL(n)
#endif
#if (_GLIBCXX_VERBOSE_LEVEL == 1)
#define _GLIBCXX_CALL(n) printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#define _GLIBCXX_CALL(n) \
printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", \
__PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#endif
#ifndef _GLIBCXX_SCALE_DOWN_FPU
/** @brief Use floating-point scaling instead of modulo for mapping
* random numbers to a range. This can be faster on certain CPUs. */
#define _GLIBCXX_SCALE_DOWN_FPU 0
#endif
#ifndef _GLIBCXX_ASSERTIONS
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Should be switched on only locally. */
#define _GLIBCXX_ASSERTIONS 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
/** @brief Consider the size of the L1 cache for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
/** @brief Consider the size of the TLB for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
#endif
#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
/** @brief First copy the data, sort it locally, and merge it back
* (0); or copy it back after everything is done (1).
*
* Recommendation: 0 */
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
#endif
......@@ -39,30 +39,58 @@
namespace __gnu_parallel
{
/** @brief Function to split a sequence into parts of almost equal size.
/** @brief Function to split a sequence into parts of almost equal size.
*
* The resulting sequence s of length p+1 contains the splitting
* The resulting sequence s of length num_threads+1 contains the splitting
* positions when splitting the range [0,n) into parts of almost
* equal size (plus or minus 1). The first entry is 0, the last one
* n. Some parts may be empty.
* @param n Number of elements
* @param p Number of parts
* @param num_threads Number of parts
* @param s Splitters
* @returns End of splitter sequence, i. e. @c s+p+1 */
template<typename _DifferenceTp, typename OutputIterator>
* @returns End of splitter sequence, i. e. @c s+num_threads+1 */
template<typename difference_type, typename OutputIterator>
OutputIterator
equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s)
equally_split(difference_type n,
thread_index_t num_threads,
OutputIterator s)
{
typedef _DifferenceTp difference_type;
difference_type chunk_length = n / p, split = n % p, start = 0;
for (int i = 0; i < p; i++)
difference_type chunk_length = n / num_threads,
num_longer_chunks = n % num_threads,
pos = 0;
for (thread_index_t i = 0; i < num_threads; ++i)
{
*s++ = start;
start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length;
*s++ = pos;
pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
}
*s++ = n;
return s;
}
/** @brief Function to split a sequence into parts of almost equal size.
*
* Returns the position of the splitting point between
* thread number thread_no (included) and
* thread number thread_no+1 (excluded).
* @param n Number of elements
* @param num_threads Number of parts
* @returns Splitting point */
template<typename difference_type>
difference_type
equally_split_point(difference_type n,
thread_index_t num_threads,
thread_index_t thread_no)
{
difference_type chunk_length = n / num_threads,
num_longer_chunks = n % num_threads;
if(thread_no < num_longer_chunks)
return thread_no * (chunk_length + 1);
else
return num_longer_chunks * (chunk_length + 1)
+ (thread_no - num_longer_chunks) * chunk_length;
}
}
#endif
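A worked example of the splitting scheme above, written against plain arrays (the values are illustrative): 10 elements over 3 threads give one longer chunk of 4 and two chunks of 3, so the splitters are 0, 4, 7, 10, and equally_split_point yields the same boundaries one at a time.

#include <cassert>

int main()
{
  // n = 10, num_threads = 3: chunk_length = 3, one longer chunk.
  const int n = 10, p = 3;
  int s[p + 1];
  int chunk = n / p, longer = n % p, pos = 0;
  for (int i = 0; i < p; ++i)
    {
      s[i] = pos;
      pos += (i < longer) ? chunk + 1 : chunk;
    }
  s[p] = n;
  assert(s[0] == 0 && s[1] == 4 && s[2] == 7 && s[3] == 10);
  // equally_split_point(10, 3, t) would likewise give 0, 4 and 7 for t = 0, 1, 2.
  return 0;
}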
......@@ -66,7 +66,7 @@
* @brief Include guarded (sequences may run empty) loser tree,
* moving objects.
* @see __gnu_parallel::Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE 0
#define _GLIBCXX_LOSER_TREE 1
#endif
#ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
......
......@@ -10,7 +10,7 @@
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
// You should have received a copy of the GNU General Public License
......@@ -48,7 +48,7 @@
namespace __gnu_parallel
{
/**
/**
* @brief Parallel std::find, switch for different algorithms.
* @param begin1 Begin iterator of first sequence.
* @param end1 End iterator of first sequence.
......@@ -58,7 +58,11 @@ namespace __gnu_parallel
* @param selector Functionality (e. g. std::find_if (), std::equal(),...)
* @return Place of finding in both sequences.
*/
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Pred,
typename Selector>
std::pair<RandomAccessIterator1, RandomAccessIterator2>
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
RandomAccessIterator2 begin2, Pred pred, Selector selector)
......@@ -66,11 +70,14 @@ namespace __gnu_parallel
switch (Settings::find_distribution)
{
case Settings::GROWING_BLOCKS:
return find_template(begin1, end1, begin2, pred, selector, growing_blocks_tag());
return find_template(begin1, end1, begin2, pred, selector,
growing_blocks_tag());
case Settings::CONSTANT_SIZE_BLOCKS:
return find_template(begin1, end1, begin2, pred, selector, constant_size_blocks_tag());
return find_template(begin1, end1, begin2, pred, selector,
constant_size_blocks_tag());
case Settings::EQUAL_SPLIT:
return find_template(begin1, end1, begin2, pred, selector, equal_split_tag());
return find_template(begin1, end1, begin2, pred, selector,
equal_split_tag());
default:
_GLIBCXX_PARALLEL_ASSERT(false);
return std::make_pair(begin1, begin2);
......@@ -79,7 +86,7 @@ namespace __gnu_parallel
#if _GLIBCXX_FIND_EQUAL_SPLIT
/**
/**
* @brief Parallel std::find, equal splitting variant.
* @param begin1 Begin iterator of first sequence.
* @param end1 End iterator of first sequence.
......@@ -89,9 +96,18 @@ namespace __gnu_parallel
* @param selector Functionality (e. g. std::find_if (), std::equal(),...)
* @return Place of finding in both sequences.
*/
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Pred,
typename Selector>
std::pair<RandomAccessIterator1, RandomAccessIterator2>
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, RandomAccessIterator2 begin2, Pred pred, Selector selector, equal_split_tag)
find_template(RandomAccessIterator1 begin1,
RandomAccessIterator1 end1,
RandomAccessIterator2 begin2,
Pred pred,
Selector selector,
equal_split_tag)
{
_GLIBCXX_CALL(end1 - begin1)
......@@ -100,27 +116,30 @@ namespace __gnu_parallel
typedef typename traits_type::value_type value_type;
difference_type length = end1 - begin1;
difference_type result = length;
difference_type* borders;
const thread_index_t num_threads = get_max_threads();
omp_lock_t result_lock;
omp_init_lock(&result_lock);
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1)));
thread_index_t num_threads = get_max_threads();
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 1];
equally_split(length, num_threads, borders);
} //single
#pragma omp parallel shared(result) num_threads(num_threads)
{
int iam = omp_get_thread_num();
difference_type pos = borders[iam], limit = borders[iam + 1];
thread_index_t iam = omp_get_thread_num();
difference_type start = borders[iam], stop = borders[iam + 1];
RandomAccessIterator1 i1 = begin1 + pos;
RandomAccessIterator2 i2 = begin2 + pos;
for (; pos < limit; pos++)
RandomAccessIterator1 i1 = begin1 + start;
RandomAccessIterator2 i2 = begin2 + start;
for (difference_type pos = start; pos < stop; ++pos)
{
#pragma omp flush(result)
#pragma omp flush(result)
// Result has been set to something lower.
if (result < pos)
break;
......@@ -128,25 +147,28 @@ namespace __gnu_parallel
if (selector(i1, i2, pred))
{
omp_set_lock(&result_lock);
if (result > pos)
if (pos < result)
result = pos;
omp_unset_lock(&result_lock);
break;
}
i1++;
i2++;
}
++i1;
++i2;
}
} //parallel
omp_destroy_lock(&result_lock);
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
delete[] borders;
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
begin1 + result, begin2 + result);
}
#endif
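The equal-split variant above gives every thread a fixed slice, publishes the best hit so far in a shared result index guarded by an OpenMP lock, and uses flush so threads can stop early once an earlier match is known. A simplified standalone sketch of that structure (it drops the selector/predicate machinery and the borders array of the real code):

#include <omp.h>
#include <vector>

// Returns the index of the first occurrence of x in v, or v.size() if absent.
long equal_split_find_sketch(const std::vector<int>& v, int x)
{
  long n = (long)v.size(), result = n;
  omp_lock_t result_lock;
  omp_init_lock(&result_lock);
#pragma omp parallel shared(result)
  {
    int iam = omp_get_thread_num(), p = omp_get_num_threads();
    long start = n * iam / p, stop = n * (iam + 1) / p;
    for (long pos = start; pos < stop; ++pos)
      {
#pragma omp flush(result)
        if (result < pos)            // an earlier thread already won
          break;
        if (v[pos] == x)
          {
            omp_set_lock(&result_lock);
            if (pos < result)        // keep the leftmost match
              result = pos;
            omp_unset_lock(&result_lock);
            break;
          }
      }
  }
  omp_destroy_lock(&result_lock);
  return result;
}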
#if _GLIBCXX_FIND_GROWING_BLOCKS
/**
/**
* @brief Parallel std::find, growing block size variant.
* @param begin1 Begin iterator of first sequence.
* @param end1 End iterator of first sequence.
......@@ -168,7 +190,11 @@ namespace __gnu_parallel
* for CSB, the blocks are allocated in a predetermined manner,
* namely spatial round-robin.
*/
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Pred,
typename Selector>
std::pair<RandomAccessIterator1, RandomAccessIterator2>
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
RandomAccessIterator2 begin2, Pred pred, Selector selector,
......@@ -182,39 +208,46 @@ namespace __gnu_parallel
difference_type length = end1 - begin1;
difference_type sequential_search_size = std::min<difference_type>(length, Settings::find_sequential_search_size);
difference_type sequential_search_size = std::min<difference_type>(
length, Settings::find_sequential_search_size);
// Try it sequentially first.
std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
selector.sequential_algorithm(begin1, begin1 + sequential_search_size, begin2, pred);
selector.sequential_algorithm(
begin1, begin1 + sequential_search_size, begin2, pred);
if (find_seq_result.first != (begin1 + sequential_search_size))
return find_seq_result;
// Index of beginning of next free block (after sequential find).
difference_type next_block_pos = sequential_search_size;
difference_type next_block_start = sequential_search_size;
difference_type result = length;
const thread_index_t num_threads = get_max_threads();
omp_lock_t result_lock;
omp_init_lock(&result_lock);
#pragma omp parallel shared(result) num_threads(num_threads)
thread_index_t num_threads = get_max_threads();
# pragma omp parallel shared(result) num_threads(num_threads)
{
# pragma omp single
num_threads = omp_get_num_threads();
// Not within first k elements -> start parallel.
thread_index_t iam = omp_get_thread_num();
difference_type block_size = Settings::find_initial_block_size;
difference_type start = fetch_and_add<difference_type>(&next_block_pos, block_size);
difference_type start =
fetch_and_add<difference_type>(&next_block_start, block_size);
// Get new block, update pointer to next block.
difference_type stop = std::min<difference_type>(length, start + block_size);
difference_type stop =
std::min<difference_type>(length, start + block_size);
std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
while (start < length)
{
#pragma omp flush(result)
# pragma omp flush(result)
// Get new value of result.
if (result < start)
{
......@@ -222,7 +255,8 @@ namespace __gnu_parallel
break;
}
local_result = selector.sequential_algorithm(begin1 + start, begin1 + stop, begin2 + start, pred);
local_result = selector.sequential_algorithm(
begin1 + start, begin1 + stop, begin2 + start, pred);
if (local_result.first != (begin1 + stop))
{
omp_set_lock(&result_lock);
......@@ -231,30 +265,35 @@ namespace __gnu_parallel
result = local_result.first - begin1;
// Result cannot be in future blocks, stop algorithm.
fetch_and_add<difference_type>(&next_block_pos, length);
fetch_and_add<difference_type>(&next_block_start, length);
}
omp_unset_lock(&result_lock);
}
block_size = std::min<difference_type>(block_size * Settings::find_increasing_factor, Settings::find_maximum_block_size);
block_size = std::min<difference_type>(
block_size * Settings::find_increasing_factor,
Settings::find_maximum_block_size);
// Get new block, update pointer to next block.
start = fetch_and_add<difference_type>(&next_block_pos, block_size);
stop = (length < (start + block_size)) ? length : (start + block_size);
}
start =
fetch_and_add<difference_type>(&next_block_start, block_size);
stop = (length < (start + block_size)) ?
length : (start + block_size);
}
} //parallel
omp_destroy_lock(&result_lock);
// Return iterator on found element.
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
begin1 + result, begin2 + result);
}
#endif
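The growing-blocks strategy above amortizes the synchronization cost of fetch_and_add: blocks handed out near the front of the sequence are small, so little work is wasted if the match is early, and each thread grows its block geometrically up to a cap as it moves on. An illustrative schedule follows; the three constants are made-up stand-ins for Settings::find_initial_block_size, find_increasing_factor and find_maximum_block_size, not their real defaults.

#include <algorithm>
#include <cstdio>

int main()
{
  long block = 256;                       // assumed initial block size
  const long factor = 4, maximum = 4096;  // assumed growth factor and cap
  for (int i = 0; i < 5; ++i)
    {
      std::printf("fetch %d: %ld elements\n", i, block);
      block = std::min(block * factor, maximum);  // 256, 1024, 4096, 4096, 4096
    }
  return 0;
}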
#if _GLIBCXX_FIND_CONSTANT_SIZE_BLOCKS
/**
/**
* @brief Parallel std::find, constant block size variant.
* @param begin1 Begin iterator of first sequence.
* @param end1 End iterator of first sequence.
......@@ -272,7 +311,11 @@ namespace __gnu_parallel
* blocks are allocated in a predetermined manner, namely spatial
* round-robin.
*/
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Pred,
typename Selector>
std::pair<RandomAccessIterator1, RandomAccessIterator2>
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
RandomAccessIterator2 begin2, Pred pred, Selector selector,
......@@ -285,47 +328,52 @@ namespace __gnu_parallel
difference_type length = end1 - begin1;
difference_type sequential_search_size = std::min<difference_type>(length, Settings::find_sequential_search_size);
difference_type sequential_search_size = std::min<difference_type>(
length, Settings::find_sequential_search_size);
// Try it sequentially first.
std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
selector.sequential_algorithm(begin1, begin1 + sequential_search_size, begin2, pred);
selector.sequential_algorithm(begin1, begin1 + sequential_search_size,
begin2, pred);
if (find_seq_result.first != (begin1 + sequential_search_size))
return find_seq_result;
difference_type result = length;
const thread_index_t num_threads = get_max_threads();
omp_lock_t result_lock;
omp_init_lock(&result_lock);
// Not within first sequential_search_size elements -> start parallel.
#pragma omp parallel shared(result) num_threads(num_threads)
thread_index_t num_threads = get_max_threads();
# pragma omp parallel shared(result) num_threads(num_threads)
{
# pragma omp single
num_threads = omp_get_num_threads();
thread_index_t iam = omp_get_thread_num();
difference_type block_size = Settings::find_initial_block_size;
difference_type start, stop;
// First element of thread's current iteration.
difference_type iteration_start = sequential_search_size;
// Where to work (initialization).
start = iteration_start + iam * block_size;
stop = std::min<difference_type>(length, start + block_size);
difference_type start = iteration_start + iam * block_size;
difference_type stop =
std::min<difference_type>(length, start + block_size);
std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
while (start < length)
{
// Get new value of result.
#pragma omp flush(result)
# pragma omp flush(result)
// No chance to find first element.
if (result < start)
break;
local_result = selector.sequential_algorithm(begin1 + start, begin1 + stop, begin2 + start, pred);
local_result = selector.sequential_algorithm(
begin1 + start, begin1 + stop,
begin2 + start, pred);
if (local_result.first != (begin1 + stop))
{
omp_set_lock(&result_lock);
......@@ -342,15 +390,15 @@ namespace __gnu_parallel
start = iteration_start + iam * block_size;
stop = std::min<difference_type>(length, start + block_size);
}
}
} //parallel
omp_destroy_lock(&result_lock);
// Return iterator on found element.
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
begin1 + result, begin2 + result);
}
#endif
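In the constant-size variant above no shared counter is needed at all: thread iam always takes block number iam of each round, so the assignment is a fixed spatial round-robin and only the shared result requires synchronization. An illustrative layout with made-up numbers:

#include <cstdio>

int main()
{
  const long block_size = 100;       // assumed block size
  const int num_threads = 4, iam = 2;
  long iteration_start = 0;          // would start after the sequential prefix
  for (int round = 0; round < 3; ++round)
    {
      long start = iteration_start + iam * block_size;
      // Thread 2 scans [200,300), [600,700), [1000,1100), ...
      std::printf("thread %d, round %d: [%ld, %ld)\n",
                  iam, round, start, start + block_size);
      iteration_start += num_threads * block_size;
    }
  return 0;
}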
} // end namespace
#endif
......@@ -29,9 +29,9 @@
// Public License.
/** @file parallel/losertree.h
* @brief Many generic loser tree variants.
* This file is a GNU parallel extension to the Standard C++ Library.
*/
* @brief Many generic loser tree variants.
* This file is a GNU parallel extension to the Standard C++ Library.
*/
// Written by Johannes Singler.
......@@ -49,13 +49,13 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_EXPLICIT
/** @brief Guarded loser tree, copying the whole element into the
* tree structure.
*
* Guarding is done explicitly through two flags per element, inf
* and sup. This is a quite slow variant.
*/
template<typename T, typename Comparator = std::less<T> >
/** @brief Guarded loser tree, copying the whole element into the
* tree structure.
*
* Guarding is done explicitly through two flags per element, inf
* and sup. This is a quite slow variant.
*/
template<typename T, typename Comparator = std::less<T> >
class LoserTreeExplicit
{
private:
......@@ -76,7 +76,9 @@ namespace __gnu_parallel
Comparator comp;
public:
inline LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>()) : comp(_comp)
inline
LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>())
: comp(_comp)
{
size = _size;
offset = size;
......@@ -93,9 +95,6 @@ namespace __gnu_parallel
inline ~LoserTreeExplicit()
{ delete[] losers; }
inline void
print() { }
inline int
get_min_source()
{ return losers[0].source; }
......@@ -106,7 +105,8 @@ namespace __gnu_parallel
bool inf = false;
for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
{
if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup && comp(losers[pos].key, key)) || losers[pos].inf || sup)
if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
&& comp(losers[pos].key, key)) || losers[pos].inf || sup)
{
// The other one is smaller.
std::swap(losers[pos].key, key);
......@@ -209,14 +209,14 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE
/** @brief Guarded loser tree, either copying the whole element into
* the tree structure, or looking up the element via the index.
*
* Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine. This
* is a well-performing variant.
*/
template<typename T, typename Comparator = std::less<T> >
/** @brief Guarded loser tree, either copying the whole element into
* the tree structure, or looking up the element via the index.
*
* Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine. This
* is a well-performing variant.
*/
template<typename T, typename Comparator = std::less<T> >
class LoserTree
{
private:
......@@ -240,7 +240,7 @@ namespace __gnu_parallel
// Next greater power of 2.
k = 1 << (log2(ik - 1) + 1);
offset = k;
losers = new Loser[k * 2];
losers = static_cast<Loser*>(::operator new(k * 2 * sizeof(Loser)));
for (unsigned int i = ik - 1; i < k; i++)
losers[i + k].sup = true;
}
......@@ -248,14 +248,6 @@ namespace __gnu_parallel
inline ~LoserTree()
{ delete[] losers; }
void
print()
{
for (unsigned int i = 0; i < (k * 2); i++)
printf("%d %d from %d, %d\n", i, losers[i].key,
losers[i].source, losers[i].sup);
}
inline int
get_min_source()
{ return losers[0].source; }
......@@ -267,7 +259,7 @@ namespace __gnu_parallel
losers[pos].sup = sup;
losers[pos].source = source;
losers[pos].key = key;
new(&(losers[pos].key)) T(key);
}
unsigned int
......@@ -282,7 +274,8 @@ namespace __gnu_parallel
unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1);
if (losers[right].sup ||
(!losers[left].sup && !comp(losers[right].key, losers[left].key)))
(!losers[left].sup
&& !comp(losers[right].key, losers[left].key)))
{
// Left one is less or equal.
losers[root] = losers[right];
......@@ -337,8 +330,9 @@ namespace __gnu_parallel
{
unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1);
if ( losers[right].sup ||
(!losers[left].sup && !comp(losers[right].key, losers[left].key)))
if (losers[right].sup
|| (!losers[left].sup
&& !comp(losers[right].key, losers[left].key)))
{
// Left one is less or equal.
losers[root] = losers[right];
......@@ -365,10 +359,11 @@ namespace __gnu_parallel
for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
{
// The smaller one gets promoted, ties are broken by source.
if ( (sup && (!losers[pos].sup || losers[pos].source < source)) ||
(!sup && !losers[pos].sup &&
((comp(losers[pos].key, key)) ||
(!comp(key, losers[pos].key) && losers[pos].source < source))))
if ( (sup && (!losers[pos].sup || losers[pos].source < source))
|| (!sup && !losers[pos].sup
&& ((comp(losers[pos].key, key))
|| (!comp(key, losers[pos].key)
&& losers[pos].source < source))))
{
// The other one is smaller.
std::swap(losers[pos].sup, sup);
......@@ -387,14 +382,14 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_REFERENCE
/** @brief Guarded loser tree, either copying the whole element into
* the tree structure, or looking up the element via the index.
*
* Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine. This
* is a well-performing variant.
*/
template<typename T, typename Comparator = std::less<T> >
/** @brief Guarded loser tree, either copying the whole element into
* the tree structure, or looking up the element via the index.
*
* Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine. This
* is a well-performing variant.
*/
template<typename T, typename Comparator = std::less<T> >
class LoserTreeReference
{
#undef COPY
......@@ -423,7 +418,9 @@ namespace __gnu_parallel
Comparator comp;
public:
inline LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
inline
LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>())
: comp(_comp)
{
ik = _k;
......@@ -446,13 +443,6 @@ namespace __gnu_parallel
#endif
}
void
print()
{
for (unsigned int i = 0; i < (k * 2); i++)
printf("%d %d from %d, %d\n", i, KEY(i), losers[i].source, losers[i].sup);
}
inline int
get_min_source()
{ return losers[0].source; }
......@@ -570,7 +560,8 @@ namespace __gnu_parallel
if ( (sup && (!losers[pos].sup || losers[pos].source < source)) ||
(!sup && !losers[pos].sup &&
((comp(KEY(pos), KEY_SOURCE(source))) ||
(!comp(KEY_SOURCE(source), KEY(pos)) && losers[pos].source < source))))
(!comp(KEY_SOURCE(source), KEY(pos))
&& losers[pos].source < source))))
{
// The other one is smaller.
std::swap(losers[pos].sup, sup);
......@@ -595,13 +586,13 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_POINTER
/** @brief Guarded loser tree, either copying the whole element into
/** @brief Guarded loser tree, either copying the whole element into
the tree structure, or looking up the element via the index.
* Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine.
* This is a well-performing variant.
*/
template<typename T, typename Comparator = std::less<T> >
* Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine.
* This is a well-performing variant.
*/
template<typename T, typename Comparator = std::less<T> >
class LoserTreePointer
{
private:
......@@ -617,7 +608,9 @@ namespace __gnu_parallel
Comparator comp;
public:
inline LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
inline
LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>())
: comp(_comp)
{
ik = _k;
......@@ -632,13 +625,6 @@ namespace __gnu_parallel
inline ~LoserTreePointer()
{ delete[] losers; }
void
print()
{
for (unsigned int i = 0; i < (k * 2); i++)
printf("%d %d from %d, %d\n", i, losers[i].keyp, losers[i].source, losers[i].sup);
}
inline int
get_min_source()
{ return losers[0].source; }
......@@ -664,8 +650,9 @@ namespace __gnu_parallel
{
unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1);
if ( losers[right].sup ||
(!losers[left].sup && !comp(*losers[right].keyp, *losers[left].keyp)))
if (losers[right].sup
|| (!losers[left].sup
&& !comp(*losers[right].keyp, *losers[left].keyp)))
{
// Left one is less or equal.
losers[root] = losers[right];
......@@ -773,13 +760,13 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_UNGUARDED
/** @brief Unguarded loser tree, copying the whole element into the
* tree structure.
*
* No guarding is done, therefore none of the input sequences may
* run empty. This is a very fast variant.
*/
template<typename T, typename Comparator = std::less<T> >
/** @brief Unguarded loser tree, copying the whole element into the
* tree structure.
*
* No guarding is done, therefore none of the input sequences may
* run empty. This is a very fast variant.
*/
template<typename T, typename Comparator = std::less<T> >
class LoserTreeUnguarded
{
private:
......@@ -809,7 +796,9 @@ namespace __gnu_parallel
}
public:
inline LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
inline
LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>())
: comp(_comp)
{
ik = _k;
// Next greater or equal power of 2.
......@@ -826,13 +815,6 @@ namespace __gnu_parallel
delete[] mapping;
}
void
print()
{
for (unsigned int i = 0; i < k + ik; i++)
printf("%d %d from %d\n", i, losers[i].key, losers[i].source);
}
inline int
get_min_source()
{ return losers[0].source; }
......@@ -855,7 +837,8 @@ namespace __gnu_parallel
// Next greater or equal power of 2.
unsigned int division = 1 << (log2(end - begin - 1));
unsigned int left = init_winner(2 * root, begin, begin + division);
unsigned int right = init_winner(2 * root + 1, begin + division, end);
unsigned int right =
init_winner(2 * root + 1, begin + division, end);
if (!comp(losers[right].key, losers[left].key))
{
// Left one is less or equal.
......@@ -912,7 +895,8 @@ namespace __gnu_parallel
{
// The smaller one gets promoted, ties are broken by source.
if (comp(losers[pos].key, keyr)
|| (!comp(keyr, losers[pos].key) && losers[pos].source < source))
|| (!comp(keyr, losers[pos].key)
&& losers[pos].source < source))
{
// The other one is smaller.
std::swap(losers[pos].source, source);
......@@ -926,13 +910,13 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED
/** @brief Unguarded loser tree, keeping only pointers to the
* elements in the tree structure.
*
* No guarding is done, therefore none of the input sequences may
* run empty. This is a very fast variant.
*/
template<typename T, typename Comparator = std::less<T> >
/** @brief Unguarded loser tree, keeping only pointers to the
* elements in the tree structure.
*
* No guarding is done, therefore none of the input sequences may
* run empty. This is a very fast variant.
*/
template<typename T, typename Comparator = std::less<T> >
class LoserTreePointerUnguarded
{
private:
......@@ -961,7 +945,10 @@ namespace __gnu_parallel
}
public:
inline LoserTreePointerUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
inline
LoserTreePointerUnguarded(unsigned int _k,
Comparator _comp = std::less<T>())
: comp(_comp)
{
ik = _k;
......@@ -979,13 +966,6 @@ namespace __gnu_parallel
delete[] mapping;
}
void
print()
{
for (unsigned int i = 0; i < k + ik; i++)
printf("%d %d from %d\n", i, *losers[i].keyp, losers[i].source);
}
inline int
get_min_source()
{ return losers[0].source; }
......@@ -1008,7 +988,8 @@ namespace __gnu_parallel
// Next greater or equal power of 2.
unsigned int division = 1 << (log2(end - begin - 1));
unsigned int left = init_winner(2 * root, begin, begin + division);
unsigned int right = init_winner(2 * root + 1, begin + division, end);
unsigned int right
= init_winner(2 * root + 1, begin + division, end);
if (!comp(*losers[right].keyp, *losers[left].keyp))
{
// Left one is less or equal.
......@@ -1079,7 +1060,7 @@ namespace __gnu_parallel
};
#endif
template<typename _ValueTp, class Comparator>
template<typename _ValueTp, class Comparator>
struct loser_tree_traits
{
#if _GLIBCXX_LOSER_TREE
......@@ -1093,7 +1074,7 @@ namespace __gnu_parallel
#endif
};
template<typename _ValueTp, class Comparator>
template<typename _ValueTp, class Comparator>
struct loser_tree_unguarded_traits
{
#if _GLIBCXX_LOSER_TREE_UNGUARDED
......
......@@ -29,16 +29,16 @@
// Public License.
/** @file parallel/multiway_merge.h
* @brief Implementation of sequential and parallel multiway merge.
*
* Explanations on the high-speed merging routines in the appendix of
*
* P. Sanders.
* Fast priority queues for cached memory.
* ACM Journal of Experimental Algorithmics, 5, 2000.
*
* This file is a GNU parallel extension to the Standard C++ Library.
*/
* @brief Implementation of sequential and parallel multiway merge.
*
* Explanations on the high-speed merging routines in the appendix of
*
* P. Sanders.
* Fast priority queues for cached memory.
* ACM Journal of Experimental Algorithmics, 5, 2000.
*
* This file is a GNU parallel extension to the Standard C++ Library.
*/
// Written by Johannes Singler.
......@@ -62,15 +62,15 @@
// XXX need iterator typedefs
namespace __gnu_parallel
{
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
class guarded_iterator;
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2);
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2);
......@@ -80,7 +80,7 @@ namespace __gnu_parallel
* Deriving from RandomAccessIterator is not possible since
* RandomAccessIterator need not be a class.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
class guarded_iterator
{
private:
......@@ -125,17 +125,21 @@ namespace __gnu_parallel
{ return current; }
friend bool
operator< <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2);
operator< <RandomAccessIterator, Comparator>(
guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2);
friend bool
operator<= <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2);
operator<= <RandomAccessIterator, Comparator>(
guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2);
};
/** @brief Compare two elements referenced by guarded iterators.
/** @brief Compare two elements referenced by guarded iterators.
* @param bi1 First iterator.
* @param bi2 Second iterator.
* @return @c True if less. */
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2)
......@@ -147,11 +151,11 @@ namespace __gnu_parallel
return (bi1.comp)(*bi1, *bi2); //normal compare
}
/** @brief Compare two elements referenced by guarded iterators.
/** @brief Compare two elements referenced by guarded iterators.
* @param bi1 First iterator.
* @param bi2 Second iterator.
* @return @c True if less equal. */
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2)
......@@ -163,20 +167,20 @@ namespace __gnu_parallel
return !(bi1.comp)(*bi2, *bi1); //normal compare
}
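The guarded comparisons above make an exhausted sequence behave like an element larger than everything else, so it can never be selected by the merge. A conceptual sketch with plain pointers standing in for the guarded iterators (not the library's class):

#include <cassert>

struct guarded_demo
{
  const int* current;
  const int* end;
  bool exhausted() const { return current == end; }
};

// "bi1 < bi2" in the sense used above: an empty sequence acts as +infinity.
inline bool
guarded_less(const guarded_demo& a, const guarded_demo& b)
{
  if (a.exhausted())
    return b.exhausted();         // a is empty: it only compares less if b is empty too
  if (b.exhausted())
    return true;                  // a still has elements, so it wins
  return *a.current < *b.current; // normal compare
}

int main()
{
  int s1[] = { 1, 5 }, s2[] = { 3 };
  guarded_demo a = { s1, s1 + 2 };        // still has elements
  guarded_demo b = { s2 + 1, s2 + 1 };    // exhausted
  assert(guarded_less(a, b) && !guarded_less(b, a));
  return 0;
}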
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
class unguarded_iterator;
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
class unguarded_iterator
{
private:
......@@ -217,17 +221,21 @@ namespace __gnu_parallel
{ return current; }
friend bool
operator< <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
operator< <RandomAccessIterator, Comparator>(
unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
friend bool
operator<= <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
operator<= <RandomAccessIterator, Comparator>(
unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
};
/** @brief Compare two elements referenced by unguarded iterators.
/** @brief Compare two elements referenced by unguarded iterators.
* @param bi1 First iterator.
* @param bi2 Second iterator.
* @return @c True if less. */
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
......@@ -236,11 +244,11 @@ namespace __gnu_parallel
return (bi1.comp)(*bi1, *bi2);
}
/** @brief Compare two elements referenced by unguarded iterators.
/** @brief Compare two elements referenced by unguarded iterators.
* @param bi1 First iterator.
* @param bi2 Second iterator.
* @return @c True if less equal. */
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
......@@ -249,26 +257,30 @@ namespace __gnu_parallel
return !(bi1.comp)(*bi2, *bi1);
}
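The distinction between the two iterator flavours boils down to one extra check per comparison: a guarded head comparison must treat an exhausted sequence as larger than any live one, so that it always loses, while the unguarded variant may drop that check because the caller guarantees that no sequence runs dry. A rough standalone illustration (guarded_head_less and unguarded_head_less are hypothetical names, not the classes above):
#include <utility>
// A "head" is a (current, end) pair into one sorted input sequence.
template<typename It, typename Comp>
bool guarded_head_less(const std::pair<It, It>& a,
                       const std::pair<It, It>& b, Comp comp)
{
  if (a.first == a.second)          // a exhausted: never the minimum
    return false;
  if (b.first == b.second)          // b exhausted: a wins automatically
    return true;
  return comp(*a.first, *b.first);  // both live: normal compare
}
template<typename It, typename Comp>
bool unguarded_head_less(const std::pair<It, It>& a,
                         const std::pair<It, It>& b, Comp comp)
{
  // Precondition: neither sequence is exhausted, so no end checks needed.
  return comp(*a.first, *b.first);
}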
/** Prepare a set of sequences to be merged without an (end) guard
/** Prepare a set of sequences to be merged without an (end) guard
* @param seqs_begin
* @param seqs_end
* @param comp
* @param min_sequence
* @param stable
* @pre (seqs_end - seqs_begin > 0) */
template<typename RandomAccessIteratorIterator, typename Comparator>
typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type
template<typename RandomAccessIteratorIterator, typename Comparator>
typename std::iterator_traits<
typename std::iterator_traits<RandomAccessIteratorIterator>::value_type
::first_type>::difference_type
prepare_unguarded(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end, Comparator comp,
int& min_sequence, bool stable)
{
_GLIBCXX_CALL(seqs_end - seqs_begin)
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type
typedef typename std::iterator_traits<RandomAccessIterator1>
::difference_type
difference_type;
if ((*seqs_begin).first == (*seqs_begin).second)
......@@ -317,7 +329,8 @@ namespace __gnu_parallel
for (; s < (seqs_end - seqs_begin); s++)
{
RandomAccessIterator1 split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second, min, comp);
RandomAccessIterator1 split = std::lower_bound(
seqs_begin[s].first, seqs_begin[s].second, min, comp);
overhang_size += seqs_begin[s].second - split;
}
......@@ -325,23 +338,27 @@ namespace __gnu_parallel
return overhang_size;
}
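Put differently, prepare_unguarded locates the sequence whose last element is smallest and counts, per sequence, how many elements are not less than that value; only that overhang still needs a guarded merge. A simplified standalone sketch of the same computation (overhang_size is a hypothetical helper working on plain pointer ranges and assuming every input is non-empty):
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>
// Returns the overhang and the index of the sequence whose last element
// is smallest (simplified; not the library function above).
template<typename T, typename Comp>
std::ptrdiff_t
overhang_size(const std::vector<std::pair<const T*, const T*> >& seqs,
              Comp comp, int& min_sequence)
{
  min_sequence = 0;
  for (std::size_t s = 1; s < seqs.size(); ++s)
    if (comp(*(seqs[s].second - 1), *(seqs[min_sequence].second - 1)))
      min_sequence = static_cast<int>(s);
  const T& min_last = *(seqs[min_sequence].second - 1);
  std::ptrdiff_t overhang = 0;
  for (std::size_t s = 0; s < seqs.size(); ++s)
    {
      const T* split = std::lower_bound(seqs[s].first, seqs[s].second,
                                        min_last, comp);
      overhang += seqs[s].second - split;   // elements >= min_last
    }
  return overhang;
}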
/** Prepare a set of sequences to be merged with an (end) guard (sentinel)
/** Prepare a set of sequences to be merged with an (end) guard (sentinel)
* @param seqs_begin
* @param seqs_end
* @param comp */
template<typename RandomAccessIteratorIterator, typename Comparator>
typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type
template<typename RandomAccessIteratorIterator, typename Comparator>
typename std::iterator_traits<typename std::iterator_traits<
RandomAccessIteratorIterator>::value_type::first_type>::difference_type
prepare_unguarded_sentinel(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
Comparator comp)
{
_GLIBCXX_CALL(seqs_end - seqs_begin)
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
typedef typename std::iterator_traits<RandomAccessIterator1>
::value_type
value_type;
typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type
typedef typename std::iterator_traits<RandomAccessIterator1>
::difference_type
difference_type;
// Last element in sequence.
......@@ -362,8 +379,8 @@ namespace __gnu_parallel
difference_type overhang_size = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
{
RandomAccessIterator1 split = std::lower_bound((*s).first, (*s).second,
*max, comp);
RandomAccessIterator1 split =
std::lower_bound((*s).first, (*s).second, *max, comp);
overhang_size += (*s).second - split;
// Set sentinel.
......@@ -374,7 +391,7 @@ namespace __gnu_parallel
return overhang_size;
}
/** @brief Highly efficient 3-way merging procedure.
/** @brief Highly efficient 3-way merging procedure.
* @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence.
......@@ -382,15 +399,25 @@ namespace __gnu_parallel
* @param length Maximum length to merge.
* @param stable Unused, stable anyway.
* @return End iterator of output sequence. */
template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
template<typename RAI, typename C> class iterator,
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_3_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_3_variant(
RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp, _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -456,14 +483,23 @@ namespace __gnu_parallel
return target;
}
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -472,7 +508,8 @@ namespace __gnu_parallel
RandomAccessIterator3 target_end;
// Stable anyway.
difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
difference_type overhang =
prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
......@@ -480,7 +517,8 @@ namespace __gnu_parallel
if (overhang != -1)
{
difference_type unguarded_length = std::min(length, total_length - overhang);
difference_type unguarded_length =
std::min(length, total_length - overhang);
target_end = multiway_merge_3_variant<unguarded_iterator>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable);
overhang = length - unguarded_length;
......@@ -527,7 +565,7 @@ namespace __gnu_parallel
return target_end;
}
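The length split performed above is plain arithmetic: merge min(length, total_length - overhang) elements without guards, then let the guarded pass finish the rest. A small worked example with made-up numbers:
#include <algorithm>
#include <cassert>
int main()
{
  typedef long difference_type;
  const difference_type total_length = 10 + 12 + 8; // three inputs
  const difference_type overhang = 5;               // from prepare_unguarded
  const difference_type length = 30;                // requested output length
  difference_type unguarded_length =
    std::min(length, total_length - overhang);      // 25 elements, no guards
  difference_type guarded_rest = length - unguarded_length; // 5 elements
  assert(unguarded_length == 25 && guarded_rest == 5);
  return 0;
}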
/** @brief Highly efficient 4-way merging procedure.
/** @brief Highly efficient 4-way merging procedure.
* @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence.
......@@ -535,14 +573,23 @@ namespace __gnu_parallel
* @param length Maximum length to merge.
* @param stable Unused, stable anyway.
* @return End iterator of output sequence. */
template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
template<typename RAI, typename C> class iterator,
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp, _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -633,14 +680,23 @@ namespace __gnu_parallel
return target;
}
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -649,7 +705,8 @@ namespace __gnu_parallel
RandomAccessIterator3 target_end;
// Stable anyway.
difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
difference_type overhang =
prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
......@@ -657,7 +714,8 @@ namespace __gnu_parallel
if (overhang != -1)
{
difference_type unguarded_length = std::min(length, total_length - overhang);
difference_type unguarded_length =
std::min(length, total_length - overhang);
target_end = multiway_merge_4_variant<unguarded_iterator>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable);
overhang = length - unguarded_length;
......@@ -674,10 +732,13 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
#endif
std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > one_missing(seqs_begin, seqs_end);
std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> >
one_missing(seqs_begin, seqs_end);
one_missing.erase(one_missing.begin() + min_seq); //remove
target_end = multiway_merge_3_variant<guarded_iterator>(one_missing.begin(), one_missing.end(), target_end, comp, overhang, stable);
target_end = multiway_merge_3_variant<guarded_iterator>(
one_missing.begin(), one_missing.end(),
target_end, comp, overhang, stable);
// Insert back again.
one_missing.insert(one_missing.begin() + min_seq, seqs_begin[min_seq]);
......@@ -692,7 +753,7 @@ namespace __gnu_parallel
return target_end;
}
/** @brief Basic multi-way merging procedure.
/** @brief Basic multi-way merging procedure.
*
* The head elements are kept in a sorted array; new heads are
* inserted linearly.
......@@ -704,14 +765,22 @@ namespace __gnu_parallel
* @param stable Stable merging incurs a performance penalty.
* @return End iterator of output sequence.
*/
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp, _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -719,7 +788,8 @@ namespace __gnu_parallel
// Num remaining pieces.
int k = static_cast<int>(seqs_end - seqs_begin), nrp;
value_type* pl = static_cast<value_type*>(::operator new(sizeof(value_type) * k));
value_type* pl = static_cast<value_type*>(
::operator new(sizeof(value_type) * k));
int* source = new int[k];
difference_type total_length = 0;
......@@ -818,7 +888,8 @@ namespace __gnu_parallel
// Sink down.
j = 1;
while ((j < nrp) && (comp(pl[j], pl[j - 1]) ||
(!comp(pl[j - 1], pl[j]) && (source[j] < source[j - 1]))))
(!comp(pl[j - 1], pl[j])
&& (source[j] < source[j - 1]))))
{
std::swap(pl[j - 1], pl[j]);
std::swap(source[j - 1], source[j]);
......@@ -869,7 +940,7 @@ namespace __gnu_parallel
return target;
}
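Stripped of the stability bookkeeping and the raw-memory handling, the bubble variant is a k-way merge whose current heads live in a small array kept sorted by linear insertion. A hypothetical condensed version (merge_by_sorted_heads; assumes non-empty sorted inputs):
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>
template<typename T, typename Comp>
std::vector<T>
merge_by_sorted_heads(std::vector<std::pair<const T*, const T*> > seqs,
                      Comp comp)
{
  std::vector<T> out;
  // Establish the invariant: heads sorted ascending (insertion sort).
  for (std::size_t i = 1; i < seqs.size(); ++i)
    for (std::size_t j = i;
         j > 0 && comp(*seqs[j].first, *seqs[j - 1].first); --j)
      std::swap(seqs[j], seqs[j - 1]);
  while (!seqs.empty())
    {
      out.push_back(*seqs[0].first++);        // emit the smallest head
      if (seqs[0].first == seqs[0].second)
        {
          seqs.erase(seqs.begin());           // this input is exhausted
          continue;
        }
      // Sink the refilled head back into place (linear insertion).
      std::size_t j = 0;
      while (j + 1 < seqs.size() && comp(*seqs[j + 1].first, *seqs[j].first))
        {
          std::swap(seqs[j], seqs[j + 1]);
          ++j;
        }
    }
  return out;
}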
/** @brief Multi-way merging procedure for a high branching factor,
/** @brief Multi-way merging procedure for a high branching factor,
* guarded case.
*
* The head elements are kept in a loser tree.
......@@ -881,14 +952,24 @@ namespace __gnu_parallel
* @param stable Stable merging incurs a performance penalty.
* @return End iterator of output sequence.
*/
template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename LT,
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -978,7 +1059,7 @@ namespace __gnu_parallel
return target;
}
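Readers who do not want to trace the LoserTree plumbing (see losertree.h) can think of the guarded high-k merge as a heap-based k-way merge; the loser tree merely replaces the heap to save comparisons per extracted element. A rough non-stable equivalent built on std::priority_queue (kway_merge_with_heap and head_greater are illustrative names, not the library's):
#include <cstddef>
#include <queue>
#include <utility>
#include <vector>
// Order (current, end) ranges by their head element, smallest on top.
template<typename T, typename Comp>
struct head_greater
{
  Comp comp;
  explicit head_greater(Comp c) : comp(c) { }
  bool operator()(const std::pair<const T*, const T*>& a,
                  const std::pair<const T*, const T*>& b) const
  { return comp(*b.first, *a.first); }       // inverted to get a min-heap
};
template<typename T, typename Comp>
std::vector<T>
kway_merge_with_heap(const std::vector<std::pair<const T*, const T*> >& seqs,
                     Comp comp)
{
  typedef std::pair<const T*, const T*> range;
  head_greater<T, Comp> hg(comp);
  std::priority_queue<range, std::vector<range>, head_greater<T, Comp> >
    heap(hg);
  for (std::size_t s = 0; s < seqs.size(); ++s)
    if (seqs[s].first != seqs[s].second)
      heap.push(seqs[s]);                    // guarded: skip empty inputs
  std::vector<T> out;
  while (!heap.empty())
    {
      range r = heap.top();
      heap.pop();
      out.push_back(*r.first++);             // emit the global minimum
      if (r.first != r.second)
        heap.push(r);                        // feed from the same sequence
    }
  return out;
}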
/** @brief Multi-way merging procedure for a high branching factor,
/** @brief Multi-way merging procedure for a high branching factor,
* unguarded case.
*
* The head elements are kept in a loser tree.
......@@ -991,14 +1072,23 @@ namespace __gnu_parallel
* @return End iterator of output sequence.
* @pre No input will run out of elements during the merge.
*/
template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename LT,
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp, typename Comparator>
RandomAccessIterator3
multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -1045,13 +1135,16 @@ namespace __gnu_parallel
source = lt.get_min_source();
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1)));
_GLIBCXX_PARALLEL_ASSERT(i == 0
|| !comp(*(seqs_begin[source].first), *(target - 1)));
#endif
*(target++) = *(seqs_begin[source].first++);
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i == length - 1));
_GLIBCXX_PARALLEL_ASSERT(
(seqs_begin[source].first != seqs_begin[source].second)
|| (i == length - 1));
i++;
#endif
// Feed.
......@@ -1071,15 +1164,19 @@ namespace __gnu_parallel
#if _GLIBCXX_ASSERTIONS
if (i > 0 && comp(*(seqs_begin[source].first), *(target - 1)))
printf(" %i %i %i\n", length, i, source);
_GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1)));
_GLIBCXX_PARALLEL_ASSERT(i == 0
|| !comp(*(seqs_begin[source].first), *(target - 1)));
#endif
*(target++) = *(seqs_begin[source].first++);
#if _GLIBCXX_ASSERTIONS
if (!((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1)))
if (!((seqs_begin[source].first != seqs_begin[source].second)
|| (i >= length - 1)))
printf(" %i %i %i\n", length, i, source);
_GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1));
_GLIBCXX_PARALLEL_ASSERT(
(seqs_begin[source].first != seqs_begin[source].second)
|| (i >= length - 1));
i++;
#endif
// Feed.
......@@ -1091,15 +1188,24 @@ namespace __gnu_parallel
return target;
}
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -1115,7 +1221,8 @@ namespace __gnu_parallel
if (overhang != -1)
{
difference_type unguarded_length = std::min(length, total_length - overhang);
difference_type unguarded_length =
std::min(length, total_length - overhang);
target_end = multiway_merge_loser_tree_unguarded
<typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable);
......@@ -1145,23 +1252,31 @@ namespace __gnu_parallel
return target_end;
}
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef std::iterator_traits<RandomAccessIteratorIterator> traits_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
RandomAccessIterator1;
RandomAccessIterator3 target_end;
difference_type overhang = prepare_unguarded_sentinel(seqs_begin, seqs_end, comp);
difference_type overhang =
prepare_unguarded_sentinel(seqs_begin, seqs_end, comp);
difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
......@@ -1172,7 +1287,8 @@ namespace __gnu_parallel
(*s).second++;
}
difference_type unguarded_length = std::min(length, total_length - overhang);
difference_type unguarded_length =
std::min(length, total_length - overhang);
target_end = multiway_merge_loser_tree_unguarded
<typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable);
......@@ -1184,12 +1300,15 @@ namespace __gnu_parallel
#endif
// Copy rest stable.
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end && overhang > 0; s++)
for (RandomAccessIteratorIterator s = seqs_begin;
s != seqs_end && overhang > 0; s++)
{
// Restore.
(*s).second--;
difference_type local_length = std::min((difference_type)overhang, (difference_type)LENGTH(*s));
target_end = std::copy((*s).first, (*s).first + local_length, target_end);
difference_type local_length =
std::min<difference_type>(overhang, LENGTH(*s));
target_end = std::copy((*s).first, (*s).first + local_length,
target_end);
(*s).first += local_length;
overhang -= local_length;
}
......@@ -1203,7 +1322,7 @@ namespace __gnu_parallel
return target_end;
}
/** @brief Sequential multi-way merging switch.
/** @brief Sequential multi-way merging switch.
*
* The decision is based on the branching factor and runtime settings.
* @param seqs_begin Begin iterator of iterator pair input sequence.
......@@ -1214,14 +1333,24 @@ namespace __gnu_parallel
* @param stable Stable merging incurs a performance penalty.
* @param sentinel The sequences have a sentinel element.
* @return End iterator of output sequence. */
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel, sequential_tag)
multiway_merge(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp, _DifferenceTp length,
bool stable, bool sentinel,
sequential_tag)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -1234,7 +1363,8 @@ namespace __gnu_parallel
RandomAccessIterator3 return_target = target;
int k = static_cast<int>(seqs_end - seqs_begin);
Settings::MultiwayMergeAlgorithm mwma = Settings::multiway_merge_algorithm;
Settings::MultiwayMergeAlgorithm mwma =
Settings::multiway_merge_algorithm;
if (!sentinel && mwma == Settings::LOSER_TREE_SENTINEL)
mwma = Settings::LOSER_TREE_COMBINED;
......@@ -1244,23 +1374,40 @@ namespace __gnu_parallel
case 0:
break;
case 1:
return_target = std::copy(seqs_begin[0].first, seqs_begin[0].first + length, target);
return_target = std::copy(seqs_begin[0].first,
seqs_begin[0].first + length,
target);
seqs_begin[0].first += length;
break;
case 2:
return_target = merge_advance(seqs_begin[0].first, seqs_begin[0].second, seqs_begin[1].first, seqs_begin[1].second, target, length, comp);
return_target = merge_advance(seqs_begin[0].first,
seqs_begin[0].second,
seqs_begin[1].first,
seqs_begin[1].second,
target, length, comp);
break;
case 3:
switch (mwma)
{
case Settings::LOSER_TREE_COMBINED:
return_target = multiway_merge_3_combined(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_3_combined(seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
case Settings::LOSER_TREE_SENTINEL:
return_target = multiway_merge_3_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_3_variant<unguarded_iterator>(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
default:
return_target = multiway_merge_3_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_3_variant<guarded_iterator>(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
}
break;
......@@ -1268,13 +1415,25 @@ namespace __gnu_parallel
switch (mwma)
{
case Settings::LOSER_TREE_COMBINED:
return_target = multiway_merge_4_combined(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_4_combined(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
case Settings::LOSER_TREE_SENTINEL:
return_target = multiway_merge_4_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_4_variant<unguarded_iterator>(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
default:
return_target = multiway_merge_4_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_4_variant<guarded_iterator>(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
}
break;
......@@ -1283,26 +1442,48 @@ namespace __gnu_parallel
switch (mwma)
{
case Settings::BUBBLE:
return_target = multiway_merge_bubble(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_bubble(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#if _GLIBCXX_LOSER_TREE_EXPLICIT
case Settings::LOSER_TREE_EXPLICIT:
return_target = multiway_merge_loser_tree<LoserTreeExplicit<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_loser_tree<
LoserTreeExplicit<value_type, Comparator> >(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#endif
#if _GLIBCXX_LOSER_TREE
case Settings::LOSER_TREE:
return_target = multiway_merge_loser_tree<LoserTree<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_loser_tree<
LoserTree<value_type, Comparator> >(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#endif
#if _GLIBCXX_LOSER_TREE_COMBINED
case Settings::LOSER_TREE_COMBINED:
return_target = multiway_merge_loser_tree_combined(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_loser_tree_combined(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#endif
#if _GLIBCXX_LOSER_TREE_SENTINEL
case Settings::LOSER_TREE_SENTINEL:
return_target = multiway_merge_loser_tree_sentinel(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_loser_tree_sentinel(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#endif
default:
......@@ -1319,7 +1500,7 @@ namespace __gnu_parallel
return return_target;
}
/** @brief Parallel multi-way merge routine.
/** @brief Parallel multi-way merge routine.
*
* The decision is based on the branching factor and runtime settings.
* @param seqs_begin Begin iterator of iterator pair input sequence.
......@@ -1331,28 +1512,33 @@ namespace __gnu_parallel
* @param sentinel Ignored.
* @return End iterator of output sequence.
*/
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel)
parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable, bool sentinel)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
#if _GLIBCXX_ASSERTIONS
for (RandomAccessIteratorIterator rii = seqs_begin; rii != seqs_end; rii++)
_GLIBCXX_PARALLEL_ASSERT(is_sorted((*rii).first, (*rii).second, comp));
#endif
// k sequences.
int k = static_cast<int>(seqs_end - seqs_begin);
difference_type total_length = 0;
for (RandomAccessIteratorIterator raii = seqs_begin; raii != seqs_end; raii++)
for (RandomAccessIteratorIterator raii = seqs_begin;
raii != seqs_end; raii++)
total_length += LENGTH(*raii);
_GLIBCXX_CALL(total_length)
......@@ -1360,32 +1546,50 @@ namespace __gnu_parallel
if (total_length == 0 || k == 0)
return target;
thread_index_t num_threads = static_cast<thread_index_t>(std::min(static_cast<difference_type>(get_max_threads()), total_length));
bool tight = (total_length == length);
std::vector<std::pair<difference_type, difference_type> >* pieces;
thread_index_t num_threads = static_cast<thread_index_t>(
std::min<difference_type>(get_max_threads(), total_length));
# pragma omp parallel num_threads (num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
// Thread t will have to merge pieces[iam][0..k - 1]
std::vector<std::pair<difference_type, difference_type> >* pieces = new std::vector<std::pair<difference_type, difference_type> >[num_threads];
pieces = new std::vector<
std::pair<difference_type, difference_type> >[num_threads];
for (int s = 0; s < num_threads; s++)
pieces[s].resize(k);
difference_type num_samples = Settings::merge_oversampling * num_threads;
difference_type num_samples =
Settings::merge_oversampling * num_threads;
if (Settings::multiway_merge_splitting == Settings::SAMPLING)
{
value_type* samples = static_cast<value_type*>(::operator new(sizeof(value_type) * k * num_samples));
value_type* samples = static_cast<value_type*>(
::operator new(sizeof(value_type) * k * num_samples));
// Sample.
for (int s = 0; s < k; s++)
for (int i = 0; (difference_type)i < num_samples; i++)
{
difference_type sample_index = static_cast<difference_type>(LENGTH(seqs_begin[s]) * (double(i + 1) / (num_samples + 1)) * (double(length) / total_length));
samples[s * num_samples + i] = seqs_begin[s].first[sample_index];
difference_type sample_index =
static_cast<difference_type>(
LENGTH(seqs_begin[s]) * (double(i + 1) /
(num_samples + 1)) * (double(length)
/ total_length));
samples[s * num_samples + i] =
seqs_begin[s].first[sample_index];
}
if (stable)
__gnu_sequential::stable_sort(samples, samples + (num_samples * k), comp);
__gnu_sequential::stable_sort(
samples, samples + (num_samples * k), comp);
else
__gnu_sequential::sort(samples, samples + (num_samples * k), comp);
__gnu_sequential::sort(
samples, samples + (num_samples * k), comp);
for (int slab = 0; slab < num_threads; slab++)
// For each slab / processor.
......@@ -1393,34 +1597,51 @@ namespace __gnu_parallel
{
// For each sequence.
if (slab > 0)
pieces[slab][seq].first = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * slab / num_threads], comp) - seqs_begin[seq].first;
pieces[slab][seq].first =
std::upper_bound(
seqs_begin[seq].first,
seqs_begin[seq].second,
samples[num_samples * k * slab / num_threads],
comp)
- seqs_begin[seq].first;
else
{
// Absolute beginning.
pieces[slab][seq].first = 0;
}
if ((slab + 1) < num_threads)
pieces[slab][seq].second = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * (slab + 1) / num_threads], comp) - seqs_begin[seq].first;
pieces[slab][seq].second =
std::upper_bound(
seqs_begin[seq].first,
seqs_begin[seq].second,
samples[num_samples * k * (slab + 1) /
num_threads], comp)
- seqs_begin[seq].first;
else
pieces[slab][seq].second = LENGTH(seqs_begin[seq]); //absolute ending
pieces[slab][seq].second = LENGTH(seqs_begin[seq]);
}
delete[] samples;
}
else
{
// (Settings::multiway_merge_splitting == Settings::EXACT).
std::vector<RandomAccessIterator1>* offsets = new std::vector<RandomAccessIterator1>[num_threads];
std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > se(k);
std::vector<RandomAccessIterator1>* offsets =
new std::vector<RandomAccessIterator1>[num_threads];
std::vector<
std::pair<RandomAccessIterator1, RandomAccessIterator1>
> se(k);
copy(seqs_begin, seqs_end, se.begin());
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1)));
difference_type* borders =
new difference_type[num_threads + 1];
equally_split(length, num_threads, borders);
for (int s = 0; s < (num_threads - 1); s++)
{
offsets[s].resize(k);
multiseq_partition(se.begin(), se.end(), borders[s + 1],
multiseq_partition(
se.begin(), se.end(), borders[s + 1],
offsets[s].begin(), comp);
// Last one also needed and available.
......@@ -1446,21 +1667,23 @@ namespace __gnu_parallel
pieces[slab][seq].first = 0;
}
else
pieces[slab][seq].first = pieces[slab - 1][seq].second;
pieces[slab][seq].first =
pieces[slab - 1][seq].second;
if (!tight || slab < (num_threads - 1))
pieces[slab][seq].second = offsets[slab][seq] - seqs_begin[seq].first;
pieces[slab][seq].second =
offsets[slab][seq] - seqs_begin[seq].first;
else
{
// slab == num_threads - 1
pieces[slab][seq].second = LENGTH(seqs_begin[seq]);
pieces[slab][seq].second =
LENGTH(seqs_begin[seq]);
}
}
}
delete[] offsets;
}
} //single
# pragma omp parallel num_threads(num_threads)
{
thread_index_t iam = omp_get_thread_num();
difference_type target_position = 0;
......@@ -1470,16 +1693,21 @@ namespace __gnu_parallel
if (k > 2)
{
std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks = new std::pair<RandomAccessIterator1, RandomAccessIterator1>[k];
std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks
= new
std::pair<RandomAccessIterator1, RandomAccessIterator1>[k];
difference_type local_length = 0;
for (int s = 0; s < k; s++)
{
chunks[s] = std::make_pair(seqs_begin[s].first + pieces[iam][s].first, seqs_begin[s].first + pieces[iam][s].second);
chunks[s] = std::make_pair(
seqs_begin[s].first + pieces[iam][s].first,
seqs_begin[s].first + pieces[iam][s].second);
local_length += LENGTH(chunks[s]);
}
multiway_merge(chunks, chunks + k, target + target_position, comp,
multiway_merge(
chunks, chunks + k, target + target_position, comp,
std::min(local_length, length - target_position),
stable, false, sequential_tag());
......@@ -1487,16 +1715,19 @@ namespace __gnu_parallel
}
else if (k == 2)
{
RandomAccessIterator1 begin0 = seqs_begin[0].first + pieces[iam][0].first, begin1 = seqs_begin[1].first + pieces[iam][1].first;
RandomAccessIterator1
begin0 = seqs_begin[0].first + pieces[iam][0].first,
begin1 = seqs_begin[1].first + pieces[iam][1].first;
merge_advance(begin0,
seqs_begin[0].first + pieces[iam][0].second,
begin1,
seqs_begin[1].first + pieces[iam][1].second,
target + target_position,
(pieces[iam][0].second - pieces[iam][0].first) + (pieces[iam][1].second - pieces[iam][1].first),
(pieces[iam][0].second - pieces[iam][0].first) +
(pieces[iam][1].second - pieces[iam][1].first),
comp);
}
}
} //parallel
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp));
......@@ -1511,7 +1742,7 @@ namespace __gnu_parallel
return target + length;
}
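The SAMPLING branch above can be summarised as: draw evenly spaced samples from every input, sort all samples, and use every (k * num_samples / num_threads)-th sample as a splitter, locating it in each input with upper_bound. A simplified standalone sketch (sample_split is a hypothetical helper; it omits the length/total_length scaling and the EXACT splitting via multiseq_partition used above, and assumes at least one non-empty input):
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>
// pieces[slab][seq] is the offset at which thread "slab" starts in input
// "seq"; thread t merges [pieces[t][s], pieces[t+1][s]) for every s.
template<typename T, typename Comp>
std::vector<std::vector<std::ptrdiff_t> >
sample_split(const std::vector<std::pair<const T*, const T*> >& seqs,
             int num_threads, int oversampling, Comp comp)
{
  const int k = static_cast<int>(seqs.size());
  const int num_samples = oversampling * num_threads;
  std::vector<T> samples;
  for (int s = 0; s < k; ++s)
    {
      std::ptrdiff_t len = seqs[s].second - seqs[s].first;
      if (len == 0)
        continue;                                      // nothing to sample
      for (int i = 0; i < num_samples; ++i)
        samples.push_back(seqs[s].first[len * (i + 1) / (num_samples + 1)]);
    }
  std::sort(samples.begin(), samples.end(), comp);
  std::vector<std::vector<std::ptrdiff_t> >
    pieces(num_threads + 1, std::vector<std::ptrdiff_t>(k));
  for (int slab = 0; slab <= num_threads; ++slab)
    for (int s = 0; s < k; ++s)
      {
        if (slab == 0)
          pieces[slab][s] = 0;                              // absolute begin
        else if (slab == num_threads)
          pieces[slab][s] = seqs[s].second - seqs[s].first; // absolute end
        else
          pieces[slab][s] =
            std::upper_bound(seqs[s].first, seqs[s].second,
                             samples[samples.size() * slab / num_threads],
                             comp)
            - seqs[s].first;                                // splitter rank
      }
  return pieces;
}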
/**
/**
* @brief Multi-way merging front-end.
* @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence.
......@@ -1521,7 +1752,11 @@ namespace __gnu_parallel
* @param stable Stable merging incurs a performance penalty.
* @return End iterator of output sequence.
*/
template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorPairIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge(RandomAccessIteratorPairIterator seqs_begin,
RandomAccessIteratorPairIterator seqs_end,
......@@ -1535,15 +1770,21 @@ namespace __gnu_parallel
return target;
RandomAccessIterator3 target_end;
if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
target_end = parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (difference_type)length, stable, false);
if (_GLIBCXX_PARALLEL_CONDITION(
((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k)
&& ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
target_end = parallel_multiway_merge(
seqs_begin, seqs_end,
target, comp, static_cast<difference_type>(length), stable, false);
else
target_end = multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, false, sequential_tag());
target_end = multiway_merge(
seqs_begin, seqs_end,
target, comp, length, stable, false, sequential_tag());
return target_end;
}
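A possible way to call this front-end directly, assuming a GCC installation that ships the parallel-mode headers and a translation unit compiled with -fopenmp (whether including <parallel/multiway_merge.h> on its own is supported is not guaranteed here; the argument order follows the signature above):
#include <cstddef>
#include <functional>
#include <utility>
#include <vector>
#include <parallel/multiway_merge.h>
int main()
{
  // Three already-sorted inputs.
  int a[] = { 1, 4, 7 };
  int b[] = { 2, 5, 8 };
  int c[] = { 3, 6, 9 };
  typedef std::pair<int*, int*> seq_type;
  std::vector<seq_type> seqs;
  seqs.push_back(seq_type(a, a + 3));
  seqs.push_back(seq_type(b, b + 3));
  seqs.push_back(seq_type(c, c + 3));
  std::vector<int> out(9);
  // seqs_begin, seqs_end, target, comp, length, stable; the iterator pairs
  // are advanced as their elements are consumed.
  __gnu_parallel::multiway_merge(seqs.begin(), seqs.end(), out.begin(),
                                 std::less<int>(),
                                 static_cast<std::ptrdiff_t>(out.size()),
                                 false);
  // out now holds 1 2 3 4 5 6 7 8 9.
  return 0;
}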
/** @brief Multi-way merging front-end.
/** @brief Multi-way merging front-end.
* @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence.
......@@ -1554,7 +1795,11 @@ namespace __gnu_parallel
* @pre For each @c i, @c seqs_begin[i].second must be the end
* marker of the sequence and must also reference one additional
* sentinel element. */
template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorPairIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_sentinel(RandomAccessIteratorPairIterator seqs_begin,
RandomAccessIteratorPairIterator seqs_end,
......@@ -1570,10 +1815,16 @@ namespace __gnu_parallel
_GLIBCXX_CALL(seqs_end - seqs_begin)
if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
return parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (typename std::iterator_traits<RandomAccessIterator3>::difference_type)length, stable, true);
if (_GLIBCXX_PARALLEL_CONDITION(
((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k)
&& ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
return parallel_multiway_merge(
seqs_begin, seqs_end,
target, comp, static_cast<difference_type>(length), stable, true);
else
return multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, true, sequential_tag());
return multiway_merge(
seqs_begin, seqs_end,
target, comp, length, stable, true, sequential_tag());
}
}
......
......@@ -48,8 +48,8 @@
namespace __gnu_parallel
{
/** @brief Subsequence description. */
template<typename _DifferenceTp>
/** @brief Subsequence description. */
template<typename _DifferenceTp>
struct Piece
{
typedef _DifferenceTp difference_type;
......@@ -61,16 +61,19 @@ namespace __gnu_parallel
difference_type end;
};
/** @brief Data accessed by all threads.
/** @brief Data accessed by all threads.
*
* PMWMS = parallel multiway mergesort */
template<typename RandomAccessIterator>
template<typename RandomAccessIterator>
struct PMWMSSortingData
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
/** @brief Number of threads involved. */
thread_index_t num_threads;
/** @brief Input begin. */
RandomAccessIterator source;
......@@ -105,62 +108,55 @@ namespace __gnu_parallel
/** @brief Pieces of data to merge @c [thread][sequence] */
std::vector<Piece<difference_type> >* pieces;
};
/** @brief Thread local data for PMWMS. */
template<typename RandomAccessIterator>
struct PMWMSSorterPU
{
/** @brief Total number of thread involved. */
thread_index_t num_threads;
/** @brief Number of owning thread. */
thread_index_t iam;
/** @brief Stable sorting desired. */
bool stable;
/** @brief Pointer to global data. */
PMWMSSortingData<RandomAccessIterator>* sd;
};
};
/**
/**
* @brief Select samples from a sequence.
* @param d Pointer to thread-local data. Result will be placed in
* @c d->ds->samples.
* @param sd Pointer to algorithm data. Result will be placed in
* @c sd->samples.
* @param num_samples Number of samples to select.
*/
template<typename RandomAccessIterator, typename _DifferenceTp>
template<typename RandomAccessIterator, typename _DifferenceTp>
inline void
determine_samples(PMWMSSorterPU<RandomAccessIterator>* d,
determine_samples(PMWMSSortingData<RandomAccessIterator>* sd,
_DifferenceTp& num_samples)
{
typedef _DifferenceTp difference_type;
PMWMSSortingData<RandomAccessIterator>* sd = d->sd;
thread_index_t iam = omp_get_thread_num();
num_samples = Settings::sort_mwms_oversampling * d->num_threads - 1;
num_samples =
Settings::sort_mwms_oversampling * sd->num_threads - 1;
difference_type* es = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_samples + 2)));
difference_type* es = new difference_type[num_samples + 2];
equally_split(sd->starts[d->iam + 1] - sd->starts[d->iam], num_samples + 1, es);
equally_split(sd->starts[iam + 1] - sd->starts[iam],
num_samples + 1, es);
for (difference_type i = 0; i < num_samples; i++)
sd->samples[d->iam * num_samples + i] = sd->source[sd->starts[d->iam] + es[i + 1]];
sd->samples[iam * num_samples + i] =
sd->source[sd->starts[iam] + es[i + 1]];
delete[] es;
}
/** @brief PMWMS code executed by each thread.
* @param d Pointer to thread-local data.
/** @brief PMWMS code executed by each thread.
* @param sd Pointer to algorithm data.
* @param comp Comparator.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_mwms_pu(PMWMSSorterPU<RandomAccessIterator>* d,
parallel_sort_mwms_pu(PMWMSSortingData<RandomAccessIterator>* sd,
Comparator& comp)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
PMWMSSortingData<RandomAccessIterator>* sd = d->sd;
thread_index_t iam = d->iam;
thread_index_t iam = omp_get_thread_num();
// Length of this thread's chunk, before merging.
difference_type length_local = sd->starts[iam + 1] - sd->starts[iam];
......@@ -174,21 +170,25 @@ namespace __gnu_parallel
typedef value_type* SortingPlacesIterator;
// Sort in temporary storage, leave space for sentinel.
sd->sorting_places[iam] = sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * (length_local + 1)));
sd->sorting_places[iam] = sd->temporaries[iam] =
static_cast<value_type*>(
::operator new(sizeof(value_type) * (length_local + 1)));
// Copy there.
std::uninitialized_copy(sd->source + sd->starts[iam], sd->source + sd->starts[iam] + length_local, sd->sorting_places[iam]);
std::uninitialized_copy(sd->source + sd->starts[iam],
sd->source + sd->starts[iam] + length_local,
sd->sorting_places[iam]);
#endif
// Sort locally.
if (d->stable)
__gnu_sequential::stable_sort(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp);
if (sd->stable)
__gnu_sequential::stable_sort(sd->sorting_places[iam],
sd->sorting_places[iam] + length_local,
comp);
else
__gnu_sequential::sort(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp);
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp));
#endif
__gnu_sequential::sort(sd->sorting_places[iam],
sd->sorting_places[iam] + length_local,
comp);
// Invariant: locally sorted subsequence in sd->sorting_places[iam],
// sd->sorting_places[iam] + length_local.
......@@ -196,22 +196,23 @@ namespace __gnu_parallel
if (Settings::sort_splitting == Settings::SAMPLING)
{
difference_type num_samples;
determine_samples(d, num_samples);
determine_samples(sd, num_samples);
#pragma omp barrier
# pragma omp barrier
#pragma omp single
# pragma omp single
__gnu_sequential::sort(sd->samples,
sd->samples + (num_samples * d->num_threads),
sd->samples + (num_samples * sd->num_threads),
comp);
#pragma omp barrier
# pragma omp barrier
for (int s = 0; s < d->num_threads; s++)
for (int s = 0; s < sd->num_threads; s++)
{
// For each sequence.
if (num_samples * iam > 0)
sd->pieces[iam][s].begin = std::lower_bound(sd->sorting_places[s],
sd->pieces[iam][s].begin =
std::lower_bound(sd->sorting_places[s],
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
sd->samples[num_samples * iam],
comp)
......@@ -220,43 +221,47 @@ namespace __gnu_parallel
// Absolute beginning.
sd->pieces[iam][s].begin = 0;
if ((num_samples * (iam + 1)) < (num_samples * d->num_threads))
sd->pieces[iam][s].end = std::lower_bound(sd->sorting_places[s],
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s], sd->samples[num_samples * (iam + 1)], comp)
if ((num_samples * (iam + 1)) < (num_samples * sd->num_threads))
sd->pieces[iam][s].end =
std::lower_bound(sd->sorting_places[s],
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
sd->samples[num_samples * (iam + 1)], comp)
- sd->sorting_places[s];
else
// Absolute end.
sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
}
}
else if (Settings::sort_splitting == Settings::EXACT)
{
#pragma omp barrier
# pragma omp barrier
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads);
for (int s = 0; s < d->num_threads; s++)
seqs[s] = std::make_pair(sd->sorting_places[s], sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]);
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
seqs(sd->num_threads);
for (int s = 0; s < sd->num_threads; s++)
seqs[s] = std::make_pair(sd->sorting_places[s],
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]);
std::vector<SortingPlacesIterator> offsets(d->num_threads);
std::vector<SortingPlacesIterator> offsets(sd->num_threads);
// If not last thread.
if (iam < d->num_threads - 1)
multiseq_partition(seqs.begin(), seqs.end(), sd->starts[iam + 1], offsets.begin(), comp);
// if not last thread
if (iam < sd->num_threads - 1)
multiseq_partition(seqs.begin(), seqs.end(),
sd->starts[iam + 1], offsets.begin(), comp);
for (int seq = 0; seq < d->num_threads; seq++)
for (int seq = 0; seq < sd->num_threads; seq++)
{
// For each sequence.
if (iam < (d->num_threads - 1))
// for each sequence
if (iam < (sd->num_threads - 1))
sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
else
// Absolute end of this sequence.
// very end of this sequence
sd->pieces[iam][seq].end = sd->starts[seq + 1] - sd->starts[seq];
}
#pragma omp barrier
# pragma omp barrier
for (int seq = 0; seq < d->num_threads; seq++)
for (int seq = 0; seq < sd->num_threads; seq++)
{
// For each sequence.
if (iam > 0)
......@@ -269,7 +274,7 @@ namespace __gnu_parallel
// Offset from target begin, length after merging.
difference_type offset = 0, length_am = 0;
for (int s = 0; s < d->num_threads; s++)
for (int s = 0; s < sd->num_threads; s++)
{
length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin;
offset += sd->pieces[iam][s].begin;
......@@ -279,40 +284,37 @@ namespace __gnu_parallel
// Merge to temporary storage; uninitialized creation is not possible
// since there is no multiway_merge variant that calls placement new
// instead of the assignment operator.
sd->merging_places[iam] = sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * length_am));
sd->merging_places[iam] = sd->temporaries[iam] =
static_cast<value_type*>(
::operator new(sizeof(value_type) * length_am));
#else
// Merge directly to target.
sd->merging_places[iam] = sd->source + offset;
#endif
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads);
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
seqs(sd->num_threads);
for (int s = 0; s < d->num_threads; s++)
for (int s = 0; s < sd->num_threads; s++)
{
seqs[s] = std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin, sd->sorting_places[s] + sd->pieces[iam][s].end);
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(seqs[s].first, seqs[s].second, comp));
#endif
seqs[s] = std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin,
sd->sorting_places[s] + sd->pieces[iam][s].end);
}
multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, d->stable, false, sequential_tag());
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->merging_places[iam], sd->merging_places[iam] + length_am, comp));
#endif
multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, sd->stable, false, sequential_tag());
# pragma omp barrier
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
// Write back.
std::copy(sd->merging_places[iam], sd->merging_places[iam] + length_am,
std::copy(sd->merging_places[iam],
sd->merging_places[iam] + length_am,
sd->source + offset);
#endif
delete[] sd->temporaries[iam];
}
/** @brief PMWMS main call.
/** @brief PMWMS main call.
* @param begin Begin iterator of sequence.
* @param end End iterator of sequence.
* @param comp Comparator.
......@@ -320,12 +322,13 @@ namespace __gnu_parallel
* @param num_threads Number of threads to use.
* @param stable Stable sorting.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
int num_threads, bool stable)
int num_threads,
bool stable)
{
_GLIBCXX_CALL(n)
......@@ -336,12 +339,21 @@ namespace __gnu_parallel
if (n <= 1)
return;
// At least one element per thread.
// at least one element per thread
if (num_threads > n)
num_threads = static_cast<thread_index_t>(n);
// shared variables
PMWMSSortingData<RandomAccessIterator> sd;
difference_type* starts;
# pragma omp parallel num_threads(num_threads)
{
num_threads = omp_get_num_threads(); //no more threads than requested
# pragma omp single
{
sd.num_threads = num_threads;
sd.source = begin;
sd.temporaries = new value_type*[num_threads];
......@@ -355,12 +367,10 @@ namespace __gnu_parallel
if (Settings::sort_splitting == Settings::SAMPLING)
{
unsigned int sz = Settings::sort_mwms_oversampling * num_threads - 1;
sz *= num_threads;
// Equivalent to value_type[sz], without need of default construction.
sz *= sizeof(value_type);
sd.samples = static_cast<value_type*>(::operator new(sz));
unsigned int size =
(Settings::sort_mwms_oversampling * num_threads - 1) * num_threads;
sd.samples = static_cast<value_type*>(
::operator new(size * sizeof(value_type)));
}
else
sd.samples = NULL;
......@@ -369,28 +379,24 @@ namespace __gnu_parallel
sd.pieces = new std::vector<Piece<difference_type> >[num_threads];
for (int s = 0; s < num_threads; s++)
sd.pieces[s].resize(num_threads);
PMWMSSorterPU<RandomAccessIterator>* pus = new PMWMSSorterPU<RandomAccessIterator>[num_threads];
difference_type* starts = sd.starts = new difference_type[num_threads + 1];
starts = sd.starts = new difference_type[num_threads + 1];
sd.stable = stable;
difference_type chunk_length = n / num_threads;
difference_type split = n % num_threads;
difference_type start = 0;
difference_type pos = 0;
for (int i = 0; i < num_threads; i++)
{
starts[i] = start;
start += (i < split) ? (chunk_length + 1) : chunk_length;
pus[i].num_threads = num_threads;
pus[i].iam = i;
pus[i].sd = &sd;
pus[i].stable = stable;
starts[i] = pos;
pos += (i < split) ? (chunk_length + 1) : chunk_length;
}
starts[num_threads] = pos;
}
starts[num_threads] = start;
// Now sort in parallel.
#pragma omp parallel num_threads(num_threads)
parallel_sort_mwms_pu(&(pus[omp_get_thread_num()]), comp);
parallel_sort_mwms_pu(&sd, comp);
} //parallel
// XXX sd as RAII
delete[] starts;
delete[] sd.temporaries;
delete[] sd.sorting_places;
......@@ -401,10 +407,7 @@ namespace __gnu_parallel
delete[] sd.offsets;
delete[] sd.pieces;
delete[] pus;
}
}
} //namespace __gnu_parallel
#endif
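The starts[] loop above is the usual balanced split of n elements over num_threads chunks, with the first n % num_threads chunks one element larger. A standalone sketch using the same convention of num_threads + 1 boundaries (chunk_boundaries is a hypothetical helper, not equally_split itself):
#include <cassert>
#include <vector>
// boundaries[i] is where chunk i begins; boundaries[num_threads] == n.
std::vector<long>
chunk_boundaries(long n, int num_threads)
{
  std::vector<long> starts(num_threads + 1);
  long chunk_length = n / num_threads;
  long split = n % num_threads;
  long pos = 0;
  for (int i = 0; i < num_threads; ++i)
    {
      starts[i] = pos;
      pos += (i < split) ? (chunk_length + 1) : chunk_length;
    }
  starts[num_threads] = pos;   // == n
  return starts;
}
int main()
{
  std::vector<long> b = chunk_boundaries(10, 4);   // chunks of 3, 3, 2, 2
  assert(b[0] == 0 && b[1] == 3 && b[2] == 6 && b[3] == 8 && b[4] == 10);
  return 0;
}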
......@@ -43,10 +43,11 @@
#include <parallel/settings.h>
#include <parallel/basic_iterator.h>
#include <parallel/base.h>
namespace __gnu_parallel
{
/** @brief Embarrassingly parallel algorithm for random access
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using an OpenMP for loop.
*
* @param begin Begin iterator of element sequence.
......@@ -63,34 +64,50 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_omp_loop(RandomAccessIterator begin, RandomAccessIterator end, Op o, Fu& f, Red r, Result base, Result& output, typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_omp_loop(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type;
typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : static_cast<thread_index_t>((end - begin));
Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++)
{
thread_results[i] = r(thread_results[i], f(o, begin+i));
}
Result *thread_results;
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
#pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
# pragma omp single
{
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
}
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
for (thread_index_t i = 0; i < num_threads; i++)
thread_results[i] = Result();
}
thread_index_t iam = omp_get_thread_num();
# pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
{
output = r(output, thread_results[i]);
}
delete [] thread_results;
......@@ -100,6 +117,7 @@ namespace __gnu_parallel
return o;
}
} // end namespace
#endif
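// [Editor's sketch, not part of this patch.]  The shape this hunk gives
// for_each_template_random_access_omp_loop: ask for num_threads, but read the
// actual team size with omp_get_num_threads() inside an omp single and only
// then allocate the per-thread result slots, because with dynamic thread
// adjustment the runtime may hand out fewer threads than requested.  A
// self-contained toy version of that pattern:
#include <omp.h>
#include <cstdio>
int
main()
{
omp_set_dynamic(1);                  // runtime may shrink the team
int num_threads = 8;                 // requested, not guaranteed
long* partial = 0;
const long n = 1000000;
#pragma omp parallel num_threads(num_threads)
{
#pragma omp single
{
num_threads = omp_get_num_threads();   // size everything from this
partial = new long[num_threads];
for (int i = 0; i < num_threads; ++i)
partial[i] = 0;
}                                        // implicit barrier
int iam = omp_get_thread_num();
#pragma omp for
for (long i = 0; i < n; ++i)
partial[iam] += 1;
} //parallel
long total = 0;
for (int i = 0; i < num_threads; ++i)
total += partial[i];
delete[] partial;
std::printf("%ld elements, %d thread(s)\n", total, num_threads);
}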
......@@ -64,39 +64,50 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_omp_loop_static(RandomAccessIterator begin,
for_each_template_random_access_omp_loop_static(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r,
Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin);
Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++)
{
thread_results[i] = r(thread_results[i], f(o, begin+i));
}
Result *thread_results;
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
#pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
# pragma omp single
{
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
}
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
for (thread_index_t i = 0; i < num_threads; i++)
thread_results[i] = Result();
}
thread_index_t iam = omp_get_thread_num();
# pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
{
output = r(output, thread_results[i]);
}
delete [] thread_results;
......@@ -106,6 +117,7 @@ namespace __gnu_parallel
return o;
}
} // end namespace
#endif
......@@ -41,11 +41,12 @@
#include <omp.h>
#include <parallel/settings.h>
#include <parallel/base.h>
namespace __gnu_parallel
{
/** @brief Embarrassingly parallel algorithm for random access
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using hand-crafted parallelization by equally splitting
* the work.
*
......@@ -63,47 +64,57 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_ed(RandomAccessIterator begin,
RandomAccessIterator end, Op o, Fu& f,
Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_ed(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
const difference_type length = end - begin;
const difference_type settings_threads = static_cast<difference_type>(get_max_threads());
const difference_type dmin = settings_threads < length ? settings_threads : length;
const difference_type dmax = dmin > 1 ? dmin : 1;
Result *thread_results;
thread_index_t num_threads = static_cast<thread_index_t>(dmax);
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
}
Result *thread_results = new Result[num_threads];
thread_index_t iam = omp_get_thread_num();
#pragma omp parallel num_threads(num_threads)
{
// Neutral element.
Result reduct = Result();
thread_index_t p = num_threads;
thread_index_t iam = omp_get_thread_num();
difference_type start = iam * length / p;
difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p;
difference_type
start = equally_split_point(length, num_threads, iam),
stop = equally_split_point(length, num_threads, iam + 1);
if (start < limit)
if (start < stop)
{
reduct = f(o, begin + start);
start++;
++start;
}
for (; start < limit; start++)
for (; start < stop; ++start)
reduct = r(reduct, f(o, begin + start));
thread_results[iam] = reduct;
}
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
output = r(output, thread_results[i]);
......
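// [Editor's sketch, not part of this patch.]  for_each_template_random_access_ed
// above derives each thread's half-open range from
// equally_split_point(length, num_threads, iam).  equally_split.h is not shown
// in this view; a definition consistent with that usage, where the first
// n % num_threads pieces get one extra element, would look like:
template<typename DiffType>
DiffType
split_point_sketch(DiffType n, int num_threads, int thread_no)
{
DiffType chunk = n / num_threads;
DiffType longer = n % num_threads;      // pieces holding one extra element
if (thread_no < longer)
return thread_no * (chunk + 1);
return longer * (chunk + 1) + (thread_no - longer) * chunk;
}
// Thread i then works on [split_point_sketch(n, p, i),
// split_point_sketch(n, p, i + 1)); since split_point_sketch(n, p, p) == n,
// the pieces tile [0, n) exactly, matching the start/stop pair above.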
......@@ -48,7 +48,7 @@ namespace __gnu_parallel
{
// Problem: there is no 0-element given.
/** @brief Base case prefix sum routine.
/** @brief Base case prefix sum routine.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
......@@ -56,9 +56,13 @@ namespace __gnu_parallel
* @param value Start value. Must be passed since the neutral
* element is unknown in general.
* @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
inline OutputIterator
parallel_partial_sum_basecase(InputIterator begin, InputIterator end,
parallel_partial_sum_basecase(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::value_type value)
{
......@@ -75,7 +79,7 @@ namespace __gnu_parallel
return result;
}
/** @brief Parallel partial sum implementation, two-phase approach,
/** @brief Parallel partial sum implementation, two-phase approach,
no recursion.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
......@@ -85,31 +89,49 @@ namespace __gnu_parallel
* @param num_threads Number of threads to use.
* @return End iterator of output sequence.
*/
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator
parallel_partial_sum_linear(InputIterator begin, InputIterator end,
parallel_partial_sum_linear(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::difference_type n, int num_threads)
typename std::iterator_traits<InputIterator>::difference_type n)
{
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
if (num_threads > (n - 1))
num_threads = static_cast<thread_index_t>(n - 1);
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), n - 1);
if (num_threads < 2)
{
*result = *begin;
return parallel_partial_sum_basecase(begin + 1, end, result + 1, bin_op, *begin);
return parallel_partial_sum_basecase(
begin + 1, end, result + 1, bin_op, *begin);
}
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 2)));
difference_type* borders;
value_type* sums;
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
if (Settings::partial_sum_dilatation == 1.0f)
equally_split(n, num_threads + 1, borders);
else
{
difference_type chunk_length = (int)((double)n / ((double)num_threads + Settings::partial_sum_dilatation)), borderstart = n - num_threads * chunk_length;
difference_type chunk_length =
((double)n /
((double)num_threads + Settings::partial_sum_dilatation)),
borderstart = n - num_threads * chunk_length;
borders[0] = 0;
for (int i = 1; i < (num_threads + 1); i++)
{
......@@ -119,13 +141,13 @@ namespace __gnu_parallel
borders[num_threads + 1] = n;
}
value_type* sums = static_cast<value_type*>(::operator new(sizeof(value_type) * num_threads));
sums = static_cast<value_type*>(
::operator new(sizeof(value_type) * num_threads));
OutputIterator target_end;
} //single
#pragma omp parallel num_threads(num_threads)
{
int id = omp_get_thread_num();
if (id == 0)
int iam = omp_get_thread_num();
if (iam == 0)
{
*result = *begin;
parallel_partial_sum_basecase(begin + 1, begin + borders[1],
......@@ -134,44 +156,48 @@ namespace __gnu_parallel
}
else
{
sums[id] = std::accumulate(begin + borders[id] + 1,
begin + borders[id + 1],
*(begin + borders[id]),
sums[iam] = std::accumulate(begin + borders[iam] + 1,
begin + borders[iam + 1],
*(begin + borders[iam]),
bin_op, __gnu_parallel::sequential_tag());
}
#pragma omp barrier
# pragma omp barrier
#pragma omp single
parallel_partial_sum_basecase(sums + 1, sums + num_threads, sums + 1,
bin_op, sums[0]);
# pragma omp single
parallel_partial_sum_basecase(
sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]);
#pragma omp barrier
# pragma omp barrier
// Still same team.
parallel_partial_sum_basecase(begin + borders[id + 1],
begin + borders[id + 2],
result + borders[id + 1], bin_op,
sums[id]);
}
parallel_partial_sum_basecase(begin + borders[iam + 1],
begin + borders[iam + 2],
result + borders[iam + 1], bin_op,
sums[iam]);
} //parallel
delete [] sums;
delete[] sums;
delete[] borders;
return result + n;
}
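// [Editor's sketch, not part of this patch.]  The two-phase scheme of
// parallel_partial_sum_linear above in compact standalone form: phase one
// sums each block into a per-thread carry slot, a prefix sum over those
// carries gives every block its offset, and phase two rescans each block with
// that offset.  (Plain addition, equal-sized blocks, no partial_sum_dilatation.)
#include <omp.h>
#include <numeric>
#include <vector>
std::vector<long>
prefix_sum_sketch(const std::vector<long>& in)
{
const long n = static_cast<long>(in.size());
std::vector<long> out(in.size());
std::vector<long> carry;                 // carry[i + 1] = total of block i
int num_threads = 1;
#pragma omp parallel
{
#pragma omp single
{
num_threads = omp_get_num_threads();
carry.assign(num_threads + 1, 0);
}
int iam = omp_get_thread_num();
long start = static_cast<long>(iam) * n / num_threads;
long stop = static_cast<long>(iam + 1) * n / num_threads;
// Phase 1: block totals only.
carry[iam + 1] = std::accumulate(in.begin() + start, in.begin() + stop, 0L);
#pragma omp barrier
#pragma omp single
// carry[i] becomes the sum of all blocks before block i.
std::partial_sum(carry.begin(), carry.end(), carry.begin());
// Phase 2: rescan the block, shifted by its carry-in.
long running = carry[iam];
for (long i = start; i < stop; ++i)
{
running += in[i];
out[i] = running;
}
} //parallel
return out;
}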
/** @brief Parallel partial sum front-end.
/** @brief Parallel partial sum front-end.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator
parallel_partial_sum(InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op)
{
_GLIBCXX_CALL(begin - end);
_GLIBCXX_CALL(begin - end)
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -179,14 +205,11 @@ namespace __gnu_parallel
difference_type n = end - begin;
int num_threads = get_max_threads();
switch (Settings::partial_sum_algorithm)
{
case Settings::LINEAR:
// Need an initial offset.
return parallel_partial_sum_linear(begin, end, result, bin_op,
n, num_threads);
return parallel_partial_sum_linear(begin, end, result, bin_op, n);
default:
// Partial_sum algorithm not implemented.
_GLIBCXX_PARALLEL_ASSERT(0);
......
......@@ -45,21 +45,21 @@
#include <bits/stl_algo.h>
#include <parallel/parallel.h>
/** @brief Decide whether to declare certain variable volatile in this file. */
/** @brief Decide whether to declare certain variables volatile. */
#define _GLIBCXX_VOLATILE volatile
namespace __gnu_parallel
{
/** @brief Parallel implementation of std::partition.
/** @brief Parallel implementation of std::partition.
* @param begin Begin iterator of input sequence to split.
* @param end End iterator of input sequence to split.
* @param pred Partition predicate, possibly including some kind of pivot.
* @param max_num_threads Maximum number of threads to use for this task.
* @param num_threads Maximum number of threads to use for this task.
* @return Number of elements not fulfilling the predicate. */
template<typename RandomAccessIterator, typename Predicate>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
template<typename RandomAccessIterator, typename Predicate>
typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_partition(RandomAccessIterator begin, RandomAccessIterator end,
Predicate pred, thread_index_t max_num_threads)
Predicate pred, thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -74,25 +74,37 @@ namespace __gnu_parallel
_GLIBCXX_VOLATILE difference_type leftover_left, leftover_right;
_GLIBCXX_VOLATILE difference_type leftnew, rightnew;
bool* reserved_left, * reserved_right;
reserved_left = new bool[max_num_threads];
reserved_right = new bool[max_num_threads];
bool* reserved_left = NULL, * reserved_right = NULL;
difference_type chunk_size;
if (Settings::partition_chunk_share > 0.0)
chunk_size = std::max((difference_type)Settings::partition_chunk_size, (difference_type)((double)n * Settings::partition_chunk_share / (double)max_num_threads));
else
chunk_size = Settings::partition_chunk_size;
omp_lock_t result_lock;
omp_init_lock(&result_lock);
// At least good for two processors.
while (right - left + 1 >= 2 * max_num_threads * chunk_size)
// At least two chunks per thread.
if(right - left + 1 >= 2 * num_threads * chunk_size)
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
reserved_left = new bool[num_threads];
reserved_right = new bool[num_threads];
if (Settings::partition_chunk_share > 0.0)
chunk_size = std::max<difference_type>(
Settings::partition_chunk_size,
(double)n * Settings::partition_chunk_share /
(double)num_threads);
else
chunk_size = Settings::partition_chunk_size;
}
while (right - left + 1 >= 2 * num_threads * chunk_size)
{
# pragma omp single
{
difference_type num_chunks = (right - left + 1) / chunk_size;
thread_index_t num_threads = (int)std::min((difference_type)max_num_threads, num_chunks / 2);
for (int r = 0; r < num_threads; r++)
{
......@@ -101,11 +113,11 @@ namespace __gnu_parallel
}
leftover_left = 0;
leftover_right = 0;
} //implicit barrier
#pragma omp parallel num_threads(num_threads)
{
// Private.
difference_type thread_left, thread_left_border, thread_right, thread_right_border;
difference_type thread_left, thread_left_border,
thread_right, thread_right_border;
thread_left = left + 1;
// Just to satisfy the condition below.
......@@ -150,12 +162,15 @@ namespace __gnu_parallel
// Swap as usual.
while (thread_left < thread_right)
{
while (pred(begin[thread_left]) && thread_left <= thread_left_border)
while (pred(begin[thread_left])
&& thread_left <= thread_left_border)
thread_left++;
while (!pred(begin[thread_right]) && thread_right >= thread_right_border)
while (!pred(begin[thread_right])
&& thread_right >= thread_right_border)
thread_right--;
if (thread_left > thread_left_border || thread_right < thread_right_border)
if (thread_left > thread_left_border
|| thread_right < thread_right_border)
// Fetch new chunk(s).
break;
......@@ -167,28 +182,29 @@ namespace __gnu_parallel
// Now swap the leftover chunks to the right places.
if (thread_left <= thread_left_border)
#pragma omp atomic
# pragma omp atomic
leftover_left++;
if (thread_right >= thread_right_border)
#pragma omp atomic
# pragma omp atomic
leftover_right++;
#pragma omp barrier
# pragma omp barrier
#pragma omp single
# pragma omp single
{
leftnew = left - leftover_left * chunk_size;
rightnew = right + leftover_right * chunk_size;
}
#pragma omp barrier
# pragma omp barrier
// <=> thread_left_border + (chunk_size - 1) >= leftnew
if (thread_left <= thread_left_border
&& thread_left_border >= leftnew)
{
// Chunk already in place, reserve spot.
reserved_left[(left - (thread_left_border + 1)) / chunk_size] = true;
reserved_left[(left - (thread_left_border + 1)) / chunk_size]
= true;
}
// <=> thread_right_border - (chunk_size - 1) <= rightnew
......@@ -196,12 +212,15 @@ namespace __gnu_parallel
&& thread_right_border <= rightnew)
{
// Chunk already in place, reserve spot.
reserved_right[((thread_right_border - 1) - right) / chunk_size] = true;
reserved_right
[((thread_right_border - 1) - right) / chunk_size]
= true;
}
#pragma omp barrier
# pragma omp barrier
if (thread_left <= thread_left_border && thread_left_border < leftnew)
if (thread_left <= thread_left_border
&& thread_left_border < leftnew)
{
// Find spot and swap.
difference_type swapstart = -1;
......@@ -219,7 +238,10 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif
std::swap_ranges(begin + thread_left_border - (chunk_size - 1), begin + thread_left_border + 1, begin + swapstart);
std::swap_ranges(
begin + thread_left_border - (chunk_size - 1),
begin + thread_left_border + 1,
begin + swapstart);
}
if (thread_right >= thread_right_border
......@@ -241,12 +263,14 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif
std::swap_ranges(begin + thread_right_border, begin + thread_right_border + chunk_size, begin + swapstart);
std::swap_ranges(begin + thread_right_border,
begin + thread_right_border + chunk_size,
begin + swapstart);
}
#if _GLIBCXX_ASSERTIONS
#pragma omp barrier
# pragma omp barrier
#pragma omp single
# pragma omp single
{
for (int r = 0; r < leftover_left; r++)
_GLIBCXX_PARALLEL_ASSERT(reserved_left[r]);
......@@ -254,14 +278,16 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(reserved_right[r]);
}
#pragma omp barrier
# pragma omp barrier
#endif
#pragma omp barrier
# pragma omp barrier
left = leftnew;
right = rightnew;
}
} // end "recursion"
# pragma omp flush(left, right)
} // end "recursion" //parallel
difference_type final_left = left, final_right = right;
......@@ -298,14 +324,14 @@ namespace __gnu_parallel
return final_left + 1;
}
/**
/**
* @brief Parallel implementation of std::nth_element().
* @param begin Begin iterator of input sequence.
* @param nth Iterator of element that must be in position afterwards.
* @param end End iterator of input sequence.
* @param comp Comparator.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
void
parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth,
RandomAccessIterator end, Comparator comp)
......@@ -377,12 +403,12 @@ namespace __gnu_parallel
__gnu_sequential::sort(begin, end, comp);
}
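// [Editor's sketch, not part of this patch.]  The narrowing loop behind
// parallel_nth_element above: partition around a pivot, keep only the side
// that still contains nth, and fall back to a plain sort once the range is
// small (the __gnu_sequential::sort call).  Minimal sequential form, with
// std::partition standing in for parallel_partition:
#include <algorithm>
#include <iterator>
template<typename Iter, typename Comp>
void
nth_element_sketch(Iter begin, Iter nth, Iter end, Comp comp)
{
typedef typename std::iterator_traits<Iter>::value_type value_type;
while (end - begin > 32)
{
value_type pivot = *(begin + (end - begin) / 2);
// Three-way split: [begin, lt) < pivot, [lt, eq) == pivot, rest > pivot.
Iter lt = std::partition(begin, end,
[&](const value_type& x) { return comp(x, pivot); });
Iter eq = std::partition(lt, end,
[&](const value_type& x) { return !comp(pivot, x); });
if (nth < lt)
end = lt;                       // nth lies among the smaller elements
else if (nth < eq)
return;                         // nth landed inside the pivot block
else
begin = eq;                     // nth lies among the larger elements
}
std::sort(begin, end, comp);
}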
/** @brief Parallel implementation of std::partial_sort().
* @param begin Begin iterator of input sequence.
* @param middle Sort until this position.
* @param end End iterator of input sequence.
* @param comp Comparator. */
template<typename RandomAccessIterator, typename Comparator>
/** @brief Parallel implementation of std::partial_sort().
* @param begin Begin iterator of input sequence.
* @param middle Sort until this position.
* @param end End iterator of input sequence.
* @param comp Comparator. */
template<typename RandomAccessIterator, typename Comparator>
void
parallel_partial_sort(RandomAccessIterator begin, RandomAccessIterator middle, RandomAccessIterator end, Comparator comp)
{
......
......@@ -53,11 +53,17 @@ namespace __gnu_parallel
* this part.
*/
template<typename RandomAccessIterator, typename Comparator>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end,
inline
typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads)
typename std::iterator_traits<RandomAccessIterator>::difference_type
pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type
num_samples,
thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -65,20 +71,24 @@ namespace __gnu_parallel
difference_type n = end - begin;
num_samples = std::min(num_samples, n);
value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples));
// Allocate uninitialized, to avoid default constructor.
value_type* samples = static_cast<value_type*>(
operator new(num_samples * sizeof(value_type)));
for (difference_type s = 0; s < num_samples; s++)
{
const unsigned long long index = static_cast<unsigned long long>(s)
* n / num_samples;
samples[s] = begin[index];
new(samples + s) value_type(begin[index]);
}
__gnu_sequential::sort(samples, samples + num_samples, comp);
value_type& pivot = samples[pivot_rank * num_samples / n];
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot);
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
pred(comp, pivot);
difference_type split = parallel_partition(begin, end, pred, num_threads);
return split;
......@@ -93,7 +103,10 @@ namespace __gnu_parallel
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads)
parallel_sort_qs_conquer(RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -110,24 +123,27 @@ namespace __gnu_parallel
if (n <= 1)
return;
thread_index_t num_processors_left;
thread_index_t num_threads_left;
if ((num_threads % 2) == 1)
num_processors_left = num_threads / 2 + 1;
num_threads_left = num_threads / 2 + 1;
else
num_processors_left = num_threads / 2;
num_threads_left = num_threads / 2;
pivot_rank = n * num_processors_left / num_threads;
pivot_rank = n * num_threads_left / num_threads;
difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
difference_type split = parallel_sort_qs_divide(
begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
#pragma omp parallel sections
{
#pragma omp section
parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left);
parallel_sort_qs_conquer(begin, begin + split,
comp, num_threads_left);
#pragma omp section
parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left);
parallel_sort_qs_conquer(begin + split, end,
comp, num_threads - num_threads_left);
}
}
......@@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads);
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end,
parallel_sort_qs(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
int num_threads)
{
_GLIBCXX_CALL(n)
......@@ -165,10 +184,7 @@ Settings::sort_qs_num_samples_preset, num_threads);
// Hard to avoid.
omp_set_num_threads(num_threads);
bool old_nested = (omp_get_nested() != 0);
omp_set_nested(true);
parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
omp_set_nested(old_nested);
}
} //namespace __gnu_parallel
......
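// [Editor's sketch, not part of this patch.]  The recursion shape of
// parallel_sort_qs_conquer above: hand ceil(p/2) threads to the left call and
// the rest to the right, and aim pivot_rank at the matching fraction of the
// range so both teams receive proportional work.  Simplified standalone form
// (middle pivot and std::partition instead of sampling plus parallel_partition):
#include <algorithm>
#include <iterator>
template<typename Iter>
void
quicksort_conquer_sketch(Iter begin, Iter end, int num_threads)
{
typedef typename std::iterator_traits<Iter>::value_type value_type;
if (num_threads <= 1 || end - begin <= 1024)
{
std::sort(begin, end);            // sequential base case
return;
}
int threads_left = num_threads / 2 + num_threads % 2;   // ceil(p / 2)
value_type pivot = *(begin + (end - begin) / 2);
Iter split = std::partition(begin, end,
[&](const value_type& x) { return x < pivot; });
if (split == begin)
{
// Degenerate split (pivot was a minimum): fix one element, keep going.
std::iter_swap(begin, std::min_element(begin, end));
++split;
}
// Nested parallelism (omp_set_nested) must be enabled for the inner teams
// to really run concurrently; without it the sort is still correct.
#pragma omp parallel sections num_threads(2)
{
#pragma omp section
quicksort_conquer_sketch(begin, split, threads_left);
#pragma omp section
quicksort_conquer_sketch(split, end, num_threads - threads_left);
}
}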
......@@ -45,16 +45,16 @@
namespace __gnu_parallel
{
/** @brief Type to hold the index of a bin.
/** @brief Type to hold the index of a bin.
*
* Since many variables of this type are allocated, it should be
* chosen as small as possible.
*/
typedef unsigned short bin_index;
typedef unsigned short bin_index;
/** @brief Data known to every thread participating in
/** @brief Data known to every thread participating in
__gnu_parallel::parallel_random_shuffle(). */
template<typename RandomAccessIterator>
template<typename RandomAccessIterator>
struct DRandomShufflingGlobalData
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
......@@ -90,18 +90,15 @@ namespace __gnu_parallel
: source(_source) { }
};
/** @brief Local data for a thread participating in
/** @brief Local data for a thread participating in
__gnu_parallel::parallel_random_shuffle().
*/
template<typename RandomAccessIterator, typename RandomNumberGenerator>
template<typename RandomAccessIterator, typename RandomNumberGenerator>
struct DRSSorterPU
{
/** @brief Number of threads participating in total. */
int num_threads;
/** @brief Number of owning thread. */
int iam;
/** @brief Begin index for bins taken care of by this thread. */
bin_index bins_begin;
......@@ -115,18 +112,18 @@ namespace __gnu_parallel
DRandomShufflingGlobalData<RandomAccessIterator>* sd;
};
/** @brief Generate a random number in @c [0,2^logp).
/** @brief Generate a random number in @c [0,2^logp).
* @param logp Logarithm (base 2) of the upper range bound.
* @param rng Random number generator to use.
*/
template<typename RandomNumberGenerator>
template<typename RandomNumberGenerator>
inline int
random_number_pow2(int logp, RandomNumberGenerator& rng)
{ return rng.genrand_bits(logp); }
/** @brief Random shuffle code executed by each thread.
/** @brief Random shuffle code executed by each thread.
* @param pus Array of thread-local data records. */
template<typename RandomAccessIterator, typename RandomNumberGenerator>
template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void
parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
RandomNumberGenerator>* pus)
......@@ -135,9 +132,9 @@ namespace __gnu_parallel
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()];
thread_index_t iam = omp_get_thread_num();
DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
thread_index_t iam = d->iam;
// Indexing: dist[bin][processor]
difference_type length = sd->starts[iam + 1] - sd->starts[iam];
......@@ -166,9 +163,9 @@ namespace __gnu_parallel
for (bin_index b = 0; b < sd->num_bins + 1; b++)
sd->dist[b][iam + 1] = dist[b];
#pragma omp barrier
# pragma omp barrier
#pragma omp single
# pragma omp single
{
// Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
// total number of items in bin s
......@@ -178,13 +175,13 @@ namespace __gnu_parallel
sd->dist[s + 1]);
}
#pragma omp barrier
# pragma omp barrier
sequence_index_t offset = 0, global_offset = 0;
for (bin_index s = 0; s < d->bins_begin; s++)
global_offset += sd->dist[s + 1][d->num_threads];
#pragma omp barrier
# pragma omp barrier
for (bin_index s = d->bins_begin; s < d->bins_end; s++)
{
......@@ -193,9 +190,10 @@ namespace __gnu_parallel
offset = sd->dist[s + 1][d->num_threads];
}
sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * offset));
sd->temporaries[iam] = static_cast<value_type*>(
::operator new(sizeof(value_type) * offset));
#pragma omp barrier
# pragma omp barrier
// Draw local copies to avoid false sharing.
for (bin_index b = 0; b < sd->num_bins + 1; b++)
......@@ -223,23 +221,27 @@ namespace __gnu_parallel
delete[] bin_proc;
delete[] temporaries;
#pragma omp barrier
# pragma omp barrier
// Shuffle bins internally.
for (bin_index b = d->bins_begin; b < d->bins_end; b++)
{
value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
* end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
value_type* begin =
sd->temporaries[iam] +
((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
* end =
sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
sequential_random_shuffle(begin, end, rng);
std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
std::copy(begin, end, sd->source + global_offset +
((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
}
delete[] sd->temporaries[iam];
}
/** @brief Round up to the next greater power of 2.
/** @brief Round up to the next greater power of 2.
* @param x Integer to round up */
template<typename T>
template<typename T>
T
round_up_to_pow2(T x)
{
......@@ -249,16 +251,21 @@ namespace __gnu_parallel
return (T)1 << (log2(x - 1) + 1);
}
/** @brief Main parallel random shuffle step.
/** @brief Main parallel random shuffle step.
* @param begin Begin iterator of sequence.
* @param end End iterator of sequence.
* @param n Length of sequence.
* @param num_threads Number of threads to use.
* @param rng Random number generator to use.
*/
template<typename RandomAccessIterator, typename RandomNumberGenerator>
template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void
parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads, RandomNumberGenerator& rng)
parallel_random_shuffle_drs(
RandomAccessIterator begin,
RandomAccessIterator end,
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
thread_index_t num_threads,
RandomNumberGenerator& rng)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -275,16 +282,17 @@ namespace __gnu_parallel
// Try the L1 cache first.
// Must fit into L1.
num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
num_bins_cache = std::max<difference_type>(
1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2
// Power of 2 and at least one element per bin, at most the TLB size.
num_bins = std::min(n, (difference_type)num_bins_cache);
num_bins = std::min<difference_type>(n, num_bins_cache);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
// 2 TLB entries needed per bin.
num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
num_bins = std::min<difference_type>(Settings::TLB_size / 2, num_bins);
#endif
num_bins = round_up_to_pow2(num_bins);
......@@ -293,32 +301,41 @@ namespace __gnu_parallel
#endif
// Now try the L2 cache
// Must fit into L2
num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
1, n / (Settings::L2_cache_size / sizeof(value_type))));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2.
num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
num_bins = static_cast<bin_index>(
std::min(n, static_cast<difference_type>(num_bins_cache)));
// Power of 2 and at least one element per bin, at most the TLB size.
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
// 2 TLB entries needed per bin.
num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
num_bins = std::min(
static_cast<difference_type>(Settings::TLB_size / 2), num_bins);
#endif
num_bins = round_up_to_pow2(num_bins);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
}
#endif
num_threads = std::min((bin_index)num_threads, (bin_index)num_bins);
num_threads = std::min<bin_index>(num_threads, num_bins);
if (num_threads <= 1)
return sequential_random_shuffle(begin, end, rng);
DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
DRSSorterPU<RandomAccessIterator, random_number >* pus;
difference_type* starts;
DRSSorterPU<RandomAccessIterator, random_number >* pus = new DRSSorterPU<RandomAccessIterator, random_number >[num_threads];
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
pus = new DRSSorterPU<RandomAccessIterator, random_number>
[num_threads];
sd.temporaries = new value_type*[num_threads];
//sd.oracles = new bin_index[n];
sd.dist = new difference_type*[num_bins + 1];
sd.bin_proc = new thread_index_t[num_bins];
for (bin_index b = 0; b < num_bins + 1; b++)
......@@ -328,34 +345,36 @@ namespace __gnu_parallel
sd.dist[0][0] = 0;
sd.dist[b][0] = 0;
}
difference_type* starts = sd.starts = new difference_type[num_threads + 1];
starts = sd.starts = new difference_type[num_threads + 1];
int bin_cursor = 0;
sd.num_bins = num_bins;
sd.num_bits = log2(num_bins);
difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0;
int bin_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads;
for (int i = 0; i < num_threads; i++)
difference_type chunk_length = n / num_threads,
split = n % num_threads, start = 0;
difference_type bin_chunk_length = num_bins / num_threads,
bin_split = num_bins % num_threads;
for (thread_index_t i = 0; i < num_threads; i++)
{
starts[i] = start;
start += (i < split) ? (chunk_length + 1) : chunk_length;
int j = pus[i].bins_begin = bin_cursor;
// Range of bins for this processor.
bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length;
bin_cursor += (i < bin_split) ?
(bin_chunk_length + 1) : bin_chunk_length;
pus[i].bins_end = bin_cursor;
for (; j < bin_cursor; j++)
sd.bin_proc[j] = i;
pus[i].num_threads = num_threads;
pus[i].iam = i;
pus[i].seed = rng(std::numeric_limits<uint32>::max());
pus[i].sd = &sd;
}
starts[num_threads] = start;
} //single
// Now shuffle in parallel.
#pragma omp parallel num_threads(num_threads)
parallel_random_shuffle_drs_pu(pus);
}
delete[] starts;
delete[] sd.bin_proc;
......@@ -367,12 +386,12 @@ namespace __gnu_parallel
delete[] pus;
}
/** @brief Sequential cache-efficient random shuffle.
/** @brief Sequential cache-efficient random shuffle.
* @param begin Begin iterator of sequence.
* @param end End iterator of sequence.
* @param rng Random number generator to use.
*/
template<typename RandomAccessIterator, typename RandomNumberGenerator>
template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void
sequential_random_shuffle(RandomAccessIterator begin,
RandomAccessIterator end,
......@@ -388,7 +407,9 @@ namespace __gnu_parallel
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
// Try the L1 cache first, must fit into L1.
num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
num_bins_cache =
std::max<difference_type>
(1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2
......@@ -404,16 +425,20 @@ namespace __gnu_parallel
{
#endif
// Now try the L2 cache, must fit into L2.
num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
num_bins_cache =
static_cast<bin_index>(std::max<difference_type>(
1, n / (Settings::L2_cache_size / sizeof(value_type))));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2
// Power of 2 and at least one element per bin, at most the TLB size.
num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
num_bins = static_cast<bin_index>
(std::min(n, static_cast<difference_type>(num_bins_cache)));
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
// 2 TLB entries needed per bin
num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
num_bins =
std::min<difference_type>(Settings::TLB_size / 2, num_bins);
#endif
num_bins = round_up_to_pow2(num_bins);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
......@@ -424,9 +449,11 @@ namespace __gnu_parallel
if (num_bins > 1)
{
value_type* target = static_cast<value_type*>(::operator new(sizeof(value_type) * n));
value_type* target = static_cast<value_type*>(
::operator new(sizeof(value_type) * n));
bin_index* oracles = new bin_index[n];
difference_type* dist0 = new difference_type[num_bins + 1], * dist1 = new difference_type[num_bins + 1];
difference_type* dist0 = new difference_type[num_bins + 1],
* dist1 = new difference_type[num_bins + 1];
for (int b = 0; b < num_bins + 1; b++)
dist0[b] = 0;
......@@ -454,7 +481,8 @@ namespace __gnu_parallel
for (int b = 0; b < num_bins; b++)
{
sequential_random_shuffle(target + dist1[b], target + dist1[b + 1],
sequential_random_shuffle(target + dist1[b],
target + dist1[b + 1],
rng);
}
......@@ -467,14 +495,15 @@ namespace __gnu_parallel
__gnu_sequential::random_shuffle(begin, end, rng);
}
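// [Editor's sketch, not part of this patch.]  The cache-oriented idea behind
// sequential_random_shuffle above, in compact form: scatter the elements into
// a power-of-two number of random bins sized via the L1/L2 heuristics and
// round_up_to_pow2, then shuffle each bin on its own so every pass works on a
// cache-sized block.  (<random> replaces the library's random_number here.)
#include <algorithm>
#include <random>
#include <vector>
template<typename T>
void
binned_shuffle_sketch(std::vector<T>& v, std::size_t num_bins,
std::mt19937& rng)
{
// num_bins is assumed to be a power of two, as round_up_to_pow2 ensures.
std::vector<std::vector<T> > bins(num_bins);
for (std::size_t i = 0; i < v.size(); ++i)
bins[rng() & (num_bins - 1)].push_back(v[i]);   // uniform random bin
std::size_t pos = 0;
for (std::size_t b = 0; b < num_bins; ++b)
{
std::shuffle(bins[b].begin(), bins[b].end(), rng);
for (std::size_t i = 0; i < bins[b].size(); ++i)
v[pos++] = bins[b][i];
}
}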
/** @brief Parallel random public call.
/** @brief Parallel random public call.
* @param begin Begin iterator of sequence.
* @param end End iterator of sequence.
* @param rng Random number generator to use.
*/
template<typename RandomAccessIterator, typename RandomNumberGenerator>
template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void
parallel_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end,
parallel_random_shuffle(RandomAccessIterator begin,
RandomAccessIterator end,
RandomNumberGenerator rng = random_number())
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
......
......@@ -53,7 +53,7 @@ namespace __gnu_parallel
* @param length Length of sequence to search for.
* @param advances Returned offsets.
*/
template<typename RandomAccessIterator, typename _DifferenceTp>
template<typename RandomAccessIterator, typename _DifferenceTp>
void
calc_borders(RandomAccessIterator elements, _DifferenceTp length,
_DifferenceTp* off)
......@@ -81,7 +81,10 @@ namespace __gnu_parallel
* @param end2 End iterator of second sequence.
* @param pred Find predicate.
* @return Place of finding in first sequences. */
template<typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename Pred>
template<
typename _RandomAccessIterator1,
typename _RandomAccessIterator2,
typename Pred>
_RandomAccessIterator1
search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1,
_RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
......@@ -103,27 +106,34 @@ namespace __gnu_parallel
// Where is first occurrence of pattern? defaults to end.
difference_type result = (end1 - begin1);
difference_type *splitters;
// Pattern too long.
if (input_length < 0)
return end1;
thread_index_t num_threads = std::max<difference_type>(1, std::min<difference_type>(input_length, __gnu_parallel::get_max_threads()));
omp_lock_t result_lock;
omp_init_lock(&result_lock);
difference_type borders[num_threads + 1];
__gnu_parallel::equally_split(input_length, num_threads, borders);
thread_index_t num_threads =
std::max<difference_type>(1,
std::min<difference_type>(input_length, get_max_threads()));
difference_type advances[pattern_length];
calc_borders(begin2, pattern_length, advances);
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
splitters = new difference_type[num_threads + 1];
equally_split(input_length, num_threads, splitters);
}
thread_index_t iam = omp_get_thread_num();
difference_type start = borders[iam], stop = borders[iam + 1];
difference_type start = splitters[iam], stop = splitters[iam + 1];
difference_type pos_in_pattern = 0;
bool found_pattern = false;
......@@ -131,11 +141,12 @@ namespace __gnu_parallel
while (start <= stop && !found_pattern)
{
// Get new value of result.
#pragma omp flush(result)
#pragma omp flush(result)
// No chance for this thread to find first occurrence.
if (result < start)
break;
while (pred(begin1[start + pos_in_pattern], begin2[pos_in_pattern]))
while (pred(begin1[start + pos_in_pattern],
begin2[pos_in_pattern]))
{
++pos_in_pattern;
if (pos_in_pattern == pattern_length)
......@@ -151,12 +162,15 @@ namespace __gnu_parallel
}
// Make safe jump.
start += (pos_in_pattern - advances[pos_in_pattern]);
pos_in_pattern = (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
pos_in_pattern =
(advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
} //parallel
omp_destroy_lock(&result_lock);
delete[] splitters;
// Return iterator on found element.
return (begin1 + result);
}
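// [Editor's sketch, not part of this patch.]  advances[] above is the classic
// KMP border (failure-function) table; calc_borders itself is collapsed in
// this view.  A standard formulation with border[0] = -1 and border[j] equal
// to the length of the longest proper border of the first j pattern characters:
#include <vector>
template<typename Iter>
std::vector<long>
kmp_borders_sketch(Iter pattern, long length)
{
std::vector<long> border(length + 1);
border[0] = -1;
long k = -1;
for (long j = 0; j < length; ++j)
{
while (k >= 0 && !(pattern[k] == pattern[j]))
k = border[k];
border[j + 1] = ++k;
}
return border;
}
// After matching pos characters and then mismatching, the next candidate
// start is start + (pos - border[pos]); the border property guarantees no
// occurrence is skipped, which is what lets each thread above scan its own
// splitter range independently and only synchronize on the shared result.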
......
......@@ -47,7 +47,7 @@
namespace __gnu_parallel
{
template<typename InputIterator, typename OutputIterator>
template<typename InputIterator, typename OutputIterator>
inline OutputIterator
copy_tail(std::pair<InputIterator, InputIterator> b,
std::pair<InputIterator, InputIterator> e, OutputIterator r)
......@@ -68,7 +68,10 @@ namespace __gnu_parallel
return r;
}
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
struct symmetric_difference_func
{
typedef std::iterator_traits<InputIterator> traits_type;
......@@ -107,7 +110,8 @@ namespace __gnu_parallel
}
inline difference_type
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
const
{
difference_type counter = 0;
......@@ -144,7 +148,10 @@ namespace __gnu_parallel
};
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
struct difference_func
{
typedef std::iterator_traits<InputIterator> traits_type;
......@@ -179,7 +186,8 @@ namespace __gnu_parallel
}
inline difference_type
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
const
{
difference_type counter = 0;
......@@ -209,7 +217,10 @@ namespace __gnu_parallel
};
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
struct intersection_func
{
typedef std::iterator_traits<InputIterator> traits_type;
......@@ -243,7 +254,8 @@ namespace __gnu_parallel
}
inline difference_type
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
const
{
difference_type counter = 0;
......@@ -273,10 +285,11 @@ namespace __gnu_parallel
{ return out; }
};
template<class InputIterator, class OutputIterator, class Comparator>
template<class InputIterator, class OutputIterator, class Comparator>
struct union_func
{
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
typedef typename std::iterator_traits<InputIterator>::difference_type
difference_type;
union_func(Comparator c) : comp(c) {}
......@@ -310,8 +323,8 @@ namespace __gnu_parallel
}
inline difference_type
count(InputIterator a, const InputIterator b, InputIterator c,
const InputIterator d) const
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
const
{
difference_type counter = 0;
......@@ -343,7 +356,10 @@ namespace __gnu_parallel
{ return std::copy(a, b, out); }
};
template<typename InputIterator, typename OutputIterator, typename Operation>
template<
typename InputIterator,
typename OutputIterator,
typename Operation>
OutputIterator
parallel_set_operation(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
......@@ -355,7 +371,6 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
typedef typename std::pair<InputIterator, InputIterator> iterator_pair;
if (begin1 == end1)
return op.first_empty(begin2, end2, result);
......@@ -364,27 +379,35 @@ namespace __gnu_parallel
const difference_type size = (end1 - begin1) + (end2 - begin2);
thread_index_t num_threads = std::min<difference_type>(std::min(end1 - begin1, end2 - begin2), get_max_threads());
difference_type borders[num_threads + 2];
equally_split(size, num_threads + 1, borders);
const iterator_pair sequence[ 2 ] =
{ std::make_pair(begin1, end1), std::make_pair(begin2, end2) } ;
OutputIterator return_value = result;
difference_type *borders;
iterator_pair *block_begins;
difference_type* lengths;
const iterator_pair sequence[ 2 ] = { std::make_pair(begin1, end1), std::make_pair(begin2, end2) } ;
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(),
std::min(end1 - begin1, end2 - begin2));
iterator_pair block_begins[num_threads + 1];
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
equally_split(size, num_threads + 1, borders);
block_begins = new iterator_pair[num_threads + 1];
// Very start.
block_begins[0] = std::make_pair(begin1, begin2);
difference_type length[num_threads];
lengths = new difference_type[num_threads];
} //single
OutputIterator return_value = result;
thread_index_t iam = omp_get_thread_num();
#pragma omp parallel num_threads(num_threads)
{
// Result from multiseq_partition.
InputIterator offset[2];
const int iam = omp_get_thread_num();
const difference_type rank = borders[iam + 1];
multiseq_partition(sequence, sequence + 2, rank, offset, op.comp);
......@@ -401,10 +424,11 @@ namespace __gnu_parallel
--offset[ 0 ];
}
iterator_pair block_end = block_begins[ iam + 1 ] = iterator_pair(offset[ 0 ], offset[ 1 ]);
iterator_pair block_end = block_begins[ iam + 1 ] =
iterator_pair(offset[ 0 ], offset[ 1 ]);
// Make sure all threads have their block_begin result written out.
#pragma omp barrier
# pragma omp barrier
iterator_pair block_begin = block_begins[ iam ];
......@@ -413,16 +437,19 @@ namespace __gnu_parallel
if (iam == 0)
{
// The first thread can copy already.
length[ iam ] = op.invoke(block_begin.first, block_end.first, block_begin.second, block_end.second, result) - result;
lengths[ iam ] = op.invoke(block_begin.first, block_end.first,
block_begin.second, block_end.second,
result)
- result;
}
else
{
length[ iam ] = op.count(block_begin.first, block_end.first,
lengths[ iam ] = op.count(block_begin.first, block_end.first,
block_begin.second, block_end.second);
}
// Make sure everyone wrote their lengths.
#pragma omp barrier
# pragma omp barrier
OutputIterator r = result;
......@@ -430,18 +457,19 @@ namespace __gnu_parallel
{
// Do the last block.
for (int i = 0; i < num_threads; ++i)
r += length[i];
r += lengths[i];
block_begin = block_begins[num_threads];
// Return the result iterator of the last block.
return_value = op.invoke(block_begin.first, end1, block_begin.second, end2, r);
return_value = op.invoke(
block_begin.first, end1, block_begin.second, end2, r);
}
else
{
for (int i = 0; i < iam; ++i)
r += length[ i ];
r += lengths[ i ];
// Reset begins for copy pass.
op.invoke(block_begin.first, block_end.first,
......@@ -452,7 +480,10 @@ namespace __gnu_parallel
}
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
OutputIterator
parallel_set_union(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
......@@ -462,7 +493,10 @@ namespace __gnu_parallel
union_func< InputIterator, OutputIterator, Comparator>(comp));
}
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
OutputIterator
parallel_set_intersection(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
......@@ -473,9 +507,11 @@ namespace __gnu_parallel
}
template<typename InputIterator, typename OutputIterator>
template<typename InputIterator, typename OutputIterator>
OutputIterator
set_intersection(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result)
set_intersection(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
OutputIterator result)
{
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -484,7 +520,10 @@ namespace __gnu_parallel
std::less<value_type>());
}
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
OutputIterator
parallel_set_difference(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
......@@ -494,22 +533,20 @@ namespace __gnu_parallel
difference_func<InputIterator, OutputIterator, Comparator>(comp));
}
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
OutputIterator
parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result, Comparator comp)
parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
OutputIterator result, Comparator comp)
{
return parallel_set_operation(begin1, end1, begin2, end2, result,
symmetric_difference_func<InputIterator, OutputIterator, Comparator>(comp));
symmetric_difference_func<InputIterator, OutputIterator, Comparator>
(comp));
}
}
#endif // _GLIBCXX_SET_ALGORITHM_
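// [Editor's sketch, not part of this patch.]  The output-placement scheme used
// by parallel_set_operation above (and by parallel_unique_copy below): every
// thread first only counts its block's output length, a prefix sum over those
// lengths yields each block's write offset in result, and a second pass does
// the actual copying.  Shown here for a simple filter; parallel_filter_sketch
// and its parameters are invented names.
#include <omp.h>
#include <vector>
template<typename T, typename Pred>
std::vector<T>
parallel_filter_sketch(const std::vector<T>& in, Pred pred)
{
const long n = static_cast<long>(in.size());
int num_threads = 1;
std::vector<long> lengths;
std::vector<T> out;
#pragma omp parallel
{
#pragma omp single
{
num_threads = omp_get_num_threads();
lengths.assign(num_threads + 1, 0);  // lengths[i + 1]: block i's output
}
int iam = omp_get_thread_num();
long start = static_cast<long>(iam) * n / num_threads;
long stop = static_cast<long>(iam + 1) * n / num_threads;
// Pass 1: count only.
for (long i = start; i < stop; ++i)
if (pred(in[i]))
++lengths[iam + 1];
#pragma omp barrier
#pragma omp single
{
for (int t = 0; t < num_threads; ++t)  // prefix sum of block lengths
lengths[t + 1] += lengths[t];
out.resize(lengths[num_threads]);
}
// Pass 2: each thread writes its block at its own offset.
long pos = lengths[iam];
for (long i = start; i < stop; ++i)
if (pred(in[i]))
out[pos++] = in[i];
} //parallel
return out;
}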
......@@ -44,13 +44,16 @@
namespace __gnu_parallel
{
/** @brief Parallel std::unique_copy(), without explicit equality predicate.
/** @brief Parallel std::unique_copy(), with explicit equality predicate.
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @param binary_pred Equality predicate.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator, class BinaryPredicate>
template<
typename InputIterator,
class OutputIterator,
class BinaryPredicate>
inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result, BinaryPredicate binary_pred)
......@@ -62,20 +65,27 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
difference_type size = last - first;
int num_threads = __gnu_parallel::get_max_threads();
difference_type counter[num_threads + 1];
if (size == 0)
return result;
// Let the first thread process two parts.
difference_type borders[num_threads + 2];
__gnu_parallel::equally_split(size, num_threads + 1, borders);
difference_type *counter;
difference_type *borders;
thread_index_t num_threads = get_max_threads();
// First part contains at least one element.
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
int iam = omp_get_thread_num();
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
equally_split(size, num_threads + 1, borders);
counter = new difference_type[num_threads + 1];
}
thread_index_t iam = omp_get_thread_num();
difference_type begin, end;
......@@ -83,6 +93,7 @@ namespace __gnu_parallel
// Needed for position in output
difference_type i = 0;
OutputIterator out = result;
if (iam == 0)
{
begin = borders[0] + 1; // == 1
......@@ -120,7 +131,7 @@ namespace __gnu_parallel
// Last part still untouched.
difference_type begin_output;
#pragma omp barrier
# pragma omp barrier
// Store result in output on calculated positions.
begin_output = 0;
......@@ -170,15 +181,17 @@ namespace __gnu_parallel
for (int t = 0; t < num_threads + 1; t++)
end_output += counter[t];
delete[] borders;
return result + end_output;
}
/** @brief Parallel std::unique_copy(), without explicit equality predicate
/** @brief Parallel std::unique_copy(), without explicit equality predicate
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator>
template<typename InputIterator, class OutputIterator>
inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result)
......
......@@ -55,8 +55,8 @@ namespace __gnu_parallel
#define _GLIBCXX_JOB_VOLATILE volatile
/** @brief One job for a certain thread. */
template<typename _DifferenceTp>
/** @brief One job for a certain thread. */
template<typename _DifferenceTp>
struct Job
{
typedef _DifferenceTp difference_type;
......@@ -78,7 +78,7 @@ namespace __gnu_parallel
_GLIBCXX_JOB_VOLATILE difference_type load;
};
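// [Editor's sketch, not part of this patch.]  The reservation arithmetic the
// work-stealing loop below applies to Job::first and Job::last, written with
// std::atomic instead of the library's fetch_and_add on volatile fields:
#include <atomic>
struct JobSketch
{
std::atomic<long> first;   // next unreserved index, only ever grows
long last;                 // last index belonging to this job
};
// Owner: reserve the next chunk of its own job; the caller then processes
// [reserved, min(reserved + chunk_size - 1, job.last)].
inline long
reserve_chunk(JobSketch& job, long chunk_size)
{ return job.first.fetch_add(chunk_size); }
// Thief: steal about half of the victim's remaining range by pushing the
// victim's first forward, exactly as the owner would.
inline bool
steal_half(JobSketch& victim, long& stolen_first, long& stolen_last)
{
long remaining = victim.last - victim.first.load() + 1;
if (remaining <= 0)
return false;                            // nothing left to take
long steal = (remaining < 2) ? 1 : remaining / 2;
stolen_first = victim.first.fetch_add(steal);
stolen_last = stolen_first + steal - 1;
if (stolen_last > victim.last)             // raced with the owner: clip
stolen_last = victim.last;
return stolen_first <= stolen_last;
}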
/** @brief Work stealing algorithm for random access iterators.
/** @brief Work stealing algorithm for random access iterators.
*
* Uses O(1) additional memory. Synchronization at job lists is
* done with atomic operations.
......@@ -96,13 +96,20 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_workstealing(RandomAccessIterator begin,
for_each_template_random_access_workstealing(
RandomAccessIterator begin,
RandomAccessIterator end,
Op op, Fu& f, Red r,
Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
typename std::iterator_traits<RandomAccessIterator>::difference_type
bound)
{
_GLIBCXX_CALL(end - begin)
......@@ -110,34 +117,43 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
difference_type chunk_size = static_cast<difference_type>(Settings::workstealing_chunk_size);
difference_type chunk_size =
static_cast<difference_type>(Settings::workstealing_chunk_size);
// How many jobs?
difference_type length = (bound < 0) ? (end - begin) : bound;
// To avoid false sharing in a cache line.
const int stride = Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
const int stride =
Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
// Total number of threads currently working.
thread_index_t busy = 0;
thread_index_t num_threads = get_max_threads();
difference_type num_threads_min = num_threads < end - begin ? num_threads : end - begin;
Job<difference_type> *job;
omp_lock_t output_lock;
omp_init_lock(&output_lock);
// No more threads than jobs, at least one thread.
difference_type num_threads_max = num_threads_min > 1 ? num_threads_min : 1;
num_threads = static_cast<thread_index_t>(num_threads_max);
// Create job description array.
Job<difference_type> *job = new Job<difference_type>[num_threads * stride];
// Write base value to output.
output = base;
#pragma omp parallel shared(busy) num_threads(num_threads)
// No more threads than jobs, at least one thread.
thread_index_t num_threads =
__gnu_parallel::max<thread_index_t>(1,
__gnu_parallel::min<difference_type>(length, get_max_threads()));
# pragma omp parallel shared(busy) num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
// Create job description array.
job = new Job<difference_type>[num_threads * stride];
}
// Initialization phase.
// Flags for every thread if it is doing productive work.
......@@ -158,19 +174,22 @@ namespace __gnu_parallel
// Number of elements to steal in one attempt.
difference_type steal;
// Every thread has its own random number generator (modulo num_threads).
// Every thread has its own random number generator
// (modulo num_threads).
random_number rand_gen(iam, num_threads);
#pragma omp atomic
// This thread is currently working.
# pragma omp atomic
busy++;
iam_working = true;
// How many jobs per thread? Last thread gets the rest.
my_job.first = static_cast<difference_type>(iam * (length / num_threads));
my_job.first =
static_cast<difference_type>(iam * (length / num_threads));
my_job.last = (iam == (num_threads - 1)) ? (length - 1) : ((iam + 1) * (length / num_threads) - 1);
my_job.last = (iam == (num_threads - 1)) ?
(length - 1) : ((iam + 1) * (length / num_threads) - 1);
my_job.load = my_job.last - my_job.first + 1;
// Init result with first value (to have a base value for reduction).
......@@ -185,26 +204,29 @@ namespace __gnu_parallel
RandomAccessIterator current;
#pragma omp barrier
# pragma omp barrier
// Actual work phase
// Work on own or stolen start
while (busy > 0)
{
// Work until no productive thread left.
#pragma omp flush(busy)
# pragma omp flush(busy)
// Thread has own work to do
while (my_job.first <= my_job.last)
{
// fetch-and-add call
// Reserve current job block (size chunk_size) in my queue.
difference_type current_job = fetch_and_add<difference_type>(&(my_job.first), chunk_size);
difference_type current_job =
fetch_and_add<difference_type>(&(my_job.first), chunk_size);
// Update load, to make the three values consistent,
// first might have been changed in the meantime
my_job.load = my_job.last - my_job.first + 1;
for (difference_type job_counter = 0; job_counter < chunk_size && current_job <= my_job.last; job_counter++)
for (difference_type job_counter = 0;
job_counter < chunk_size && current_job <= my_job.last;
job_counter++)
{
// Yes: process it!
current = begin + current_job;
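
Each thread reserves work from its own queue in chunks: fetch_and_add moves my_job.first forward atomically and returns the old value, which is the first index of the chunk just claimed. The same idiom in isolation, with std::atomic standing in for __gnu_parallel::fetch_and_add (illustrative only):

#include <atomic>
#include <cstdio>

int main()
{
  const long length = 1000, chunk_size = 64;
  const long last = length - 1;
  std::atomic<long> first(0);     // next unclaimed index of this thread's range

  long claimed;
  // fetch_add returns the previous value: the start of the chunk we now own.
  while ((claimed = first.fetch_add(chunk_size)) <= last)
    {
      long chunk_last = claimed + chunk_size - 1;
      if (chunk_last > last)
        chunk_last = last;        // the final chunk may be short
      std::printf("processing [%ld, %ld]\n", claimed, chunk_last);
    }
  return 0;
}
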
......@@ -214,15 +236,14 @@ namespace __gnu_parallel
result = r(result, f(op, current));
}
#pragma omp flush(busy)
# pragma omp flush(busy)
}
// After reaching this point, a thread's job list is empty.
if (iam_working)
{
#pragma omp atomic
// This thread no longer has work.
# pragma omp atomic
busy--;
iam_working = false;
......@@ -231,16 +252,17 @@ namespace __gnu_parallel
difference_type supposed_first, supposed_last, supposed_load;
do
{
// Find random nonempty deque (not own) and do consistency check.
// Find random nonempty deque (not own), do consistency check.
yield();
#pragma omp flush(busy)
# pragma omp flush(busy)
victim = rand_gen();
supposed_first = job[victim * stride].first;
supposed_last = job[victim * stride].last;
supposed_load = job[victim * stride].load;
}
while (busy > 0
&& ((supposed_load <= 0) || ((supposed_first + supposed_load - 1) != supposed_last)));
&& ((supposed_load <= 0)
|| ((supposed_first + supposed_load - 1) != supposed_last)));
if (busy == 0)
break;
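
The retry loop above reads a victim's three fields without taking a lock and only trusts the snapshot when they agree with each other (it also gives up once no thread is busy). Written out as a standalone predicate, purely for illustration:

// Illustrative only: the snapshot is usable exactly when the retry
// condition above no longer holds for it.
inline bool
snapshot_consistent(long first, long last, long load)
{ return load > 0 && first + load - 1 == last; }
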
......@@ -251,40 +273,30 @@ namespace __gnu_parallel
// Number of elements to steal (at least one).
steal = (supposed_load < 2) ? 1 : supposed_load / 2;
// Protects against stealing threads
// omp_set_lock(&(job[victim * stride].lock));
// Push victim's start forward.
difference_type stolen_first = fetch_and_add<difference_type>(&(job[victim * stride].first), steal);
difference_type stolen_try = stolen_first + steal - difference_type(1);
// Protects against working thread
// omp_unset_lock(&(job[victim * stride].lock));
difference_type stolen_first =
fetch_and_add<difference_type>(
&(job[victim * stride].first), steal);
difference_type stolen_try =
stolen_first + steal - difference_type(1);
my_job.first = stolen_first;
// Avoid std::min dependencies.
my_job.last = stolen_try < supposed_last ? stolen_try : supposed_last;
my_job.last = __gnu_parallel::min(stolen_try, supposed_last);
my_job.load = my_job.last - my_job.first + 1;
//omp_unset_lock(&(my_job.lock));
#pragma omp atomic
// Has potential work again.
# pragma omp atomic
busy++;
iam_working = true;
#pragma omp flush(busy)
# pragma omp flush(busy)
}
#pragma omp flush(busy)
# pragma omp flush(busy)
} // end while busy > 0
// Add accumulated result to output.
omp_set_lock(&output_lock);
output = r(output, result);
omp_unset_lock(&output_lock);
//omp_destroy_lock(&(my_job.lock));
}
delete[] job;
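
The stealing step above takes half of the victim's remaining load (at least one element) by pushing the victim's first index forward with the same fetch_and_add primitive and clamping the stolen range to the victim's last index. A sketch of that step in isolation, with std::atomic as a stand-in and invented names:

#include <algorithm>
#include <atomic>
#include <cstdio>

// A victim's job description reduced to the two fields the thief touches.
struct Job
{
  std::atomic<long> first;
  long last;
  Job(long f, long l) : first(f), last(l) { }
};

int main()
{
  Job victim(100, 199);                            // victim still owns [100, 199]

  long load  = victim.last - victim.first + 1;
  long steal = (load < 2) ? 1 : load / 2;          // take half, at least one

  // Push the victim's start forward; the value returned by fetch_add is the
  // first index of the stolen range.
  long stolen_first = victim.first.fetch_add(steal);
  long stolen_last  = std::min(stolen_first + steal - 1, victim.last);

  std::printf("stole [%ld, %ld]\n", stolen_first, stolen_last);
  return 0;
}
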
......