Commit e683ee2a by Johannes Singler

re PR libstdc++/33893 ([parallel mode] Algorithms rely on omp_set_dynamic(false))

2007-11-22  Johannes Singler  <singler@ira.uka.de>

        PR libstdc++/33893
        * include/parallel/multiway_merge.h: made omp_dynamic-safe
        * include/parallel/workstealing.h: made omp_dynamic-safe
        * include/parallel/base.h: infrastructure, cleanup
        * include/parallel/par_loop.h: made omp_dynamic-safe
        * include/parallel/features.h: activate loser tree variant
        * include/parallel/quicksort.h: made omp_dynamic-safe
        * include/parallel/compiletime_settings.h: settings overridable
        * include/parallel/equally_split.h: made omp_dynamic-safe
        * include/parallel/omp_loop_static.h: made omp_dynamic-safe
        * include/parallel/random_shuffle.h: made omp_dynamic-safe
        * include/parallel/balanced_quicksort.h: made omp_dynamic-safe
        * include/parallel/set_operations.h: made omp_dynamic-safe
        * include/parallel/unique_copy.h: made omp_dynamic-safe
        * include/parallel/multiway_mergesort.h: made omp_dynamic-safe
        * include/parallel/search.h: made omp_dynamic-safe
        * include/parallel/partition.h: made omp_dynamic-safe
        * include/parallel/partial_sum.h: made omp_dynamic-safe
        * include/parallel/find.h: made omp_dynamic-safe
        * include/parallel/omp_loop.h: made omp_dynamic-safe
        * include/parallel/losertree.h: avoid default constructor

From-SVN: r130347
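PR 33893 is precisely about this situation: when dynamic adjustment of thread teams is enabled (omp_set_dynamic(true)), a parallel region may receive fewer threads than get_max_threads() announced, so per-thread result arrays can no longer be sized before the region is entered. The recurring fix throughout this patch is to ask for the real team size inside the region and to allocate there. A minimal, self-contained sketch of that idiom (illustrative names, not code from the patch):

#include <omp.h>
#include <cstdio>

int main()
{
  const int length = 1000;
  int requested = omp_get_max_threads();
  int num_threads = requested;    // may shrink when dynamic teams are on
  long* thread_results = 0;       // allocated inside the region

  omp_set_dynamic(1);             // the setting PR libstdc++/33893 is about

#pragma omp parallel num_threads(requested)
  {
#pragma omp single
    {
      num_threads = omp_get_num_threads();      // actual team size
      thread_results = new long[num_threads](); // zero-initialized slots
    }                                           // implicit barrier here

    int iam = omp_get_thread_num();
#pragma omp for
    for (int i = 0; i < length; ++i)
      thread_results[iam] += i;                 // each thread touches only its slot
  }

  long sum = 0;
  for (int i = 0; i < num_threads; ++i)
    sum += thread_results[i];
  delete[] thread_results;

  std::printf("sum = %ld, threads used = %d\n", sum, num_threads);  // 499500
  return 0;
}

The implicit barrier at the end of the single construct guarantees the array exists before any thread writes its slot, and the reduction over thread_results happens after the region, exactly as in the rewritten loops below.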
...
include/parallel/compiletime_settings.h
@@ -39,7 +39,7 @@
 #include <cstdio>
 /** @brief Determine verbosity level of the parallel mode.
- *  Level 1 prints a message each time when entering a parallel-mode function. */
+ *  Level 1 prints a message each time a parallel-mode function is entered. */
 #define _GLIBCXX_VERBOSE_LEVEL 0
 /** @def _GLIBCXX_CALL
@@ -50,27 +50,40 @@
 #define _GLIBCXX_CALL(n)
 #endif
 #if (_GLIBCXX_VERBOSE_LEVEL == 1)
-#define _GLIBCXX_CALL(n) printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
+#define _GLIBCXX_CALL(n) \
+  printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", \
+    __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
 #endif
+#ifndef _GLIBCXX_SCALE_DOWN_FPU
 /** @brief Use floating-point scaling instead of modulo for mapping
  *  random numbers to a range. This can be faster on certain CPUs. */
 #define _GLIBCXX_SCALE_DOWN_FPU 0
+#endif
+#ifndef _GLIBCXX_ASSERTIONS
 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
  *  Should be switched on only locally. */
 #define _GLIBCXX_ASSERTIONS 0
+#endif
+#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
- *  Consider the size of the L1 cache for __gnu_parallel::parallel_random_shuffle(). */
+ *  Consider the size of the L1 cache for
+ *  __gnu_parallel::parallel_random_shuffle(). */
 #define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
+#endif
+#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
- *  Consider the size of the TLB for __gnu_parallel::parallel_random_shuffle(). */
+ *  Consider the size of the TLB for
+ *  __gnu_parallel::parallel_random_shuffle(). */
 #define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
+#endif
+#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
 /** @brief First copy the data, sort it locally, and merge it back
  *  (0); or copy it back after everything is done (1).
  *
  *  Recommendation: 0 */
 #define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
+#endif
include/parallel/equally_split.h
@@ -39,30 +39,58 @@
 namespace __gnu_parallel
 {
   /** @brief Function to split a sequence into parts of almost equal size.
    *
-   *  The resulting sequence s of length p+1 contains the splitting
+   *  The resulting sequence s of length num_threads+1 contains the splitting
    *  positions when splitting the range [0,n) into parts of almost
    *  equal size (plus minus 1). The first entry is 0, the last one
    *  n. There may result empty parts.
    *  @param n Number of elements
-   *  @param p Number of parts
+   *  @param num_threads Number of parts
    *  @param s Splitters
-   *  @returns End of splitter sequence, i. e. @c s+p+1 */
+   *  @returns End of splitter sequence, i. e. @c s+num_threads+1 */
-  template<typename _DifferenceTp, typename OutputIterator>
+  template<typename difference_type, typename OutputIterator>
    OutputIterator
-    equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s)
+    equally_split(difference_type n,
+                  thread_index_t num_threads,
+                  OutputIterator s)
    {
-      typedef _DifferenceTp difference_type;
-      difference_type chunk_length = n / p, split = n % p, start = 0;
-      for (int i = 0; i < p; i++)
+      difference_type chunk_length = n / num_threads,
+                      num_longer_chunks = n % num_threads,
+                      pos = 0;
+      for (thread_index_t i = 0; i < num_threads; ++i)
        {
-          *s++ = start;
-          start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length;
+          *s++ = pos;
+          pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
        }
      *s++ = n;
      return s;
    }
+  /** @brief Function to split a sequence into parts of almost equal size.
+   *
+   *  Returns the position of the splitting point between
+   *  thread number thread_no (included) and
+   *  thread number thread_no+1 (excluded).
+   *  @param n Number of elements
+   *  @param num_threads Number of parts
+   *  @returns Splitting point */
+  template<typename difference_type>
+    difference_type
+    equally_split_point(difference_type n,
+                        thread_index_t num_threads,
+                        thread_index_t thread_no)
+    {
+      difference_type chunk_length = n / num_threads,
+                      num_longer_chunks = n % num_threads;
+      if(thread_no < num_longer_chunks)
+        return thread_no * (chunk_length + 1);
+      else
+        return num_longer_chunks * (chunk_length + 1)
+          + (thread_no - num_longer_chunks) * chunk_length;
+    }
 }
 #endif
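As a concrete illustration of the splitter semantics documented above, the logic can be exercised standalone (the function below copies the arithmetic with plain int in place of thread_index_t; the test harness is made up): splitting n = 10 over num_threads = 3 yields the positions 0, 4, 7, 10, i.e. chunks of sizes 4, 3 and 3.

#include <cstdio>

// Standalone copy of the splitting arithmetic, for illustration only.
template<typename difference_type, typename OutputIterator>
OutputIterator
equally_split(difference_type n, int num_threads, OutputIterator s)
{
  difference_type chunk_length = n / num_threads,
    num_longer_chunks = n % num_threads,
    pos = 0;
  for (int i = 0; i < num_threads; ++i)
    {
      *s++ = pos;
      pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
    }
  *s++ = n;
  return s;
}

int main()
{
  long splitters[4];
  equally_split(10L, 3, splitters);
  for (int i = 0; i < 4; ++i)
    std::printf("%ld ", splitters[i]);   // prints: 0 4 7 10
  std::printf("\n");
  return 0;
}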
include/parallel/features.h
@@ -66,7 +66,7 @@
  *  @brief Include guarded (sequences may run empty) loser tree,
  *  moving objects.
  *  @see __gnu_parallel::Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE 0
+#define _GLIBCXX_LOSER_TREE 1
 #endif
 #ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
...
include/parallel/omp_loop.h
@@ -43,10 +43,11 @@
 #include <parallel/settings.h>
 #include <parallel/basic_iterator.h>
+#include <parallel/base.h>
 namespace __gnu_parallel
 {
   /** @brief Embarrassingly parallel algorithm for random access
    *  iterators, using an OpenMP for loop.
    *
    *  @param begin Begin iterator of element sequence.
@@ -63,34 +64,50 @@ namespace __gnu_parallel
    *  std::count_n()).
    *  @return User-supplied functor (that may contain a part of the result).
    */
-  template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+  template<typename RandomAccessIterator,
+           typename Op,
+           typename Fu,
+           typename Red,
+           typename Result>
    Op
-    for_each_template_random_access_omp_loop(RandomAccessIterator begin, RandomAccessIterator end, Op o, Fu& f, Red r, Result base, Result& output, typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+    for_each_template_random_access_omp_loop(
+      RandomAccessIterator begin,
+      RandomAccessIterator end,
+      Op o, Fu& f, Red r, Result base, Result& output,
+      typename std::iterator_traits<RandomAccessIterator>::
+        difference_type bound)
    {
-      typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type;
+      typedef typename
+        std::iterator_traits<RandomAccessIterator>::difference_type
+        difference_type;
-      thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : static_cast<thread_index_t>((end - begin));
-      Result *thread_results = new Result[num_threads];
      difference_type length = end - begin;
+      thread_index_t num_threads =
+        __gnu_parallel::min<difference_type>(get_max_threads(), length);
+      Result *thread_results;
-      for (thread_index_t i = 0; i < num_threads; i++)
-        {
-          thread_results[i] = r(thread_results[i], f(o, begin+i));
-        }
-#pragma omp parallel num_threads(num_threads)
+#     pragma omp parallel num_threads(num_threads)
        {
-#pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
-          for (difference_type pos = 0; pos < length; pos++)
+#       pragma omp single
            {
-              thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
+              num_threads = omp_get_num_threads();
+              thread_results = new Result[num_threads];
+              for (thread_index_t i = 0; i < num_threads; i++)
+                thread_results[i] = Result();
            }
+          thread_index_t iam = omp_get_thread_num();
+#       pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
+          for (difference_type pos = 0; pos < length; pos++)
+            thread_results[iam] =
+              r(thread_results[iam], f(o, begin+pos));
-        }
+        } //parallel
      for (thread_index_t i = 0; i < num_threads; i++)
-        {
          output = r(output, thread_results[i]);
-        }
      delete [] thread_results;
@@ -100,6 +117,7 @@ namespace __gnu_parallel
      return o;
    }
 } // end namespace
 #endif
include/parallel/omp_loop_static.h
@@ -64,39 +64,50 @@ namespace __gnu_parallel
    *  std::count_n()).
    *  @return User-supplied functor (that may contain a part of the result).
    */
-  template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+  template<typename RandomAccessIterator,
+           typename Op,
+           typename Fu,
+           typename Red,
+           typename Result>
    Op
-    for_each_template_random_access_omp_loop_static(RandomAccessIterator begin,
+    for_each_template_random_access_omp_loop_static(
+      RandomAccessIterator begin,
      RandomAccessIterator end,
-      Op o, Fu& f, Red r,
-      Result base, Result& output,
-      typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+      Op o, Fu& f, Red r, Result base, Result& output,
+      typename std::iterator_traits<RandomAccessIterator>::
        difference_type bound)
    {
-      typedef std::iterator_traits<RandomAccessIterator> traits_type;
-      typedef typename traits_type::difference_type difference_type;
+      typedef typename
+        std::iterator_traits<RandomAccessIterator>::difference_type
+        difference_type;
-      thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin);
-      Result *thread_results = new Result[num_threads];
      difference_type length = end - begin;
+      thread_index_t num_threads =
+        std::min<difference_type>(get_max_threads(), length);
+      Result *thread_results;
-      for (thread_index_t i = 0; i < num_threads; i++)
-        {
-          thread_results[i] = r(thread_results[i], f(o, begin+i));
-        }
-#pragma omp parallel num_threads(num_threads)
+#     pragma omp parallel num_threads(num_threads)
        {
-#pragma omp for schedule(static, Settings::workstealing_chunk_size)
-          for (difference_type pos = 0; pos < length; pos++)
+#       pragma omp single
            {
-              thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
+              num_threads = omp_get_num_threads();
+              thread_results = new Result[num_threads];
+              for (thread_index_t i = 0; i < num_threads; i++)
+                thread_results[i] = Result();
            }
+          thread_index_t iam = omp_get_thread_num();
+#       pragma omp for schedule(static, Settings::workstealing_chunk_size)
+          for (difference_type pos = 0; pos < length; pos++)
+            thread_results[iam] =
+              r(thread_results[iam], f(o, begin+pos));
-        }
+        } //parallel
      for (thread_index_t i = 0; i < num_threads; i++)
-        {
          output = r(output, thread_results[i]);
-        }
      delete [] thread_results;
@@ -106,6 +117,7 @@ namespace __gnu_parallel
      return o;
    }
 } // end namespace
 #endif
include/parallel/par_loop.h
@@ -41,11 +41,12 @@
 #include <omp.h>
 #include <parallel/settings.h>
+#include <parallel/base.h>
 namespace __gnu_parallel
 {
   /** @brief Embarrassingly parallel algorithm for random access
    *  iterators, using hand-crafted parallelization by equal splitting
    *  the work.
    *
@@ -63,47 +64,57 @@ namespace __gnu_parallel
    *  std::count_n()).
    *  @return User-supplied functor (that may contain a part of the result).
    */
-  template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+  template<
+    typename RandomAccessIterator,
+    typename Op,
+    typename Fu,
+    typename Red,
+    typename Result>
    Op
-    for_each_template_random_access_ed(RandomAccessIterator begin,
-                                       RandomAccessIterator end, Op o, Fu& f,
-                                       Red r, Result base, Result& output,
-                                       typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+    for_each_template_random_access_ed(
+      RandomAccessIterator begin,
+      RandomAccessIterator end,
+      Op o, Fu& f, Red r, Result base, Result& output,
+      typename std::iterator_traits<RandomAccessIterator>::
        difference_type bound)
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::difference_type difference_type;
      const difference_type length = end - begin;
-      const difference_type settings_threads = static_cast<difference_type>(get_max_threads());
-      const difference_type dmin = settings_threads < length ? settings_threads : length;
-      const difference_type dmax = dmin > 1 ? dmin : 1;
-      thread_index_t num_threads = static_cast<thread_index_t>(dmax);
-      Result *thread_results = new Result[num_threads];
+      Result *thread_results;
+      thread_index_t num_threads =
+        __gnu_parallel::min<difference_type>(get_max_threads(), length);
-#pragma omp parallel num_threads(num_threads)
+#     pragma omp parallel num_threads(num_threads)
        {
+#       pragma omp single
+          {
+            num_threads = omp_get_num_threads();
+            thread_results = new Result[num_threads];
+          }
+          thread_index_t iam = omp_get_thread_num();
          // Neutral element.
          Result reduct = Result();
-          thread_index_t p = num_threads;
-          thread_index_t iam = omp_get_thread_num();
-          difference_type start = iam * length / p;
-          difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p;
+          difference_type
+            start = equally_split_point(length, num_threads, iam),
+            stop = equally_split_point(length, num_threads, iam + 1);
-          if (start < limit)
+          if (start < stop)
            {
              reduct = f(o, begin + start);
-              start++;
+              ++start;
            }
-          for (; start < limit; start++)
+          for (; start < stop; ++start)
            reduct = r(reduct, f(o, begin + start));
          thread_results[iam] = reduct;
-        }
+        } //parallel
      for (thread_index_t i = 0; i < num_threads; i++)
        output = r(output, thread_results[i]);
...
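The rewritten loop above drops the old iam * length / p arithmetic in favour of equally_split_point, so every thread derives its own half-open range from the team size and its thread number. A small standalone sketch of that per-thread range computation (helper and test code are made up, not from the patch):

#include <omp.h>
#include <cstdio>

// Same arithmetic as equally_split_point, copied here for illustration.
static long split_point(long n, int num_threads, int thread_no)
{
  long chunk_length = n / num_threads;
  long num_longer_chunks = n % num_threads;
  if (thread_no < num_longer_chunks)
    return thread_no * (chunk_length + 1);
  return num_longer_chunks * (chunk_length + 1)
    + (thread_no - num_longer_chunks) * chunk_length;
}

int main()
{
  const long length = 10;
#pragma omp parallel
  {
    int num_threads = omp_get_num_threads();
    int iam = omp_get_thread_num();
    long start = split_point(length, num_threads, iam);
    long stop  = split_point(length, num_threads, iam + 1);
#pragma omp critical
    std::printf("thread %d handles [%ld, %ld)\n", iam, start, stop);
  }
  return 0;
}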
...@@ -48,7 +48,7 @@ namespace __gnu_parallel ...@@ -48,7 +48,7 @@ namespace __gnu_parallel
{ {
// Problem: there is no 0-element given. // Problem: there is no 0-element given.
/** @brief Base case prefix sum routine. /** @brief Base case prefix sum routine.
* @param begin Begin iterator of input sequence. * @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence. * @param end End iterator of input sequence.
* @param result Begin iterator of output sequence. * @param result Begin iterator of output sequence.
...@@ -56,9 +56,13 @@ namespace __gnu_parallel ...@@ -56,9 +56,13 @@ namespace __gnu_parallel
* @param value Start value. Must be passed since the neutral * @param value Start value. Must be passed since the neutral
* element is unknown in general. * element is unknown in general.
* @return End iterator of output sequence. */ * @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation> template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
inline OutputIterator inline OutputIterator
parallel_partial_sum_basecase(InputIterator begin, InputIterator end, parallel_partial_sum_basecase(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op, OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::value_type value) typename std::iterator_traits<InputIterator>::value_type value)
{ {
...@@ -75,7 +79,7 @@ namespace __gnu_parallel ...@@ -75,7 +79,7 @@ namespace __gnu_parallel
return result; return result;
} }
/** @brief Parallel partial sum implementation, two-phase approach, /** @brief Parallel partial sum implementation, two-phase approach,
no recursion. no recursion.
* @param begin Begin iterator of input sequence. * @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence. * @param end End iterator of input sequence.
...@@ -85,31 +89,49 @@ namespace __gnu_parallel ...@@ -85,31 +89,49 @@ namespace __gnu_parallel
* @param num_threads Number of threads to use. * @param num_threads Number of threads to use.
* @return End iterator of output sequence. * @return End iterator of output sequence.
*/ */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation> template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator OutputIterator
parallel_partial_sum_linear(InputIterator begin, InputIterator end, parallel_partial_sum_linear(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op, OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::difference_type n, int num_threads) typename std::iterator_traits<InputIterator>::difference_type n)
{ {
typedef std::iterator_traits<InputIterator> traits_type; typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
if (num_threads > (n - 1)) thread_index_t num_threads =
num_threads = static_cast<thread_index_t>(n - 1); std::min<difference_type>(get_max_threads(), n - 1);
if (num_threads < 2) if (num_threads < 2)
{ {
*result = *begin; *result = *begin;
return parallel_partial_sum_basecase(begin + 1, end, result + 1, bin_op, *begin); return parallel_partial_sum_basecase(
begin + 1, end, result + 1, bin_op, *begin);
} }
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 2))); difference_type* borders;
value_type* sums;
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
if (Settings::partial_sum_dilatation == 1.0f) if (Settings::partial_sum_dilatation == 1.0f)
equally_split(n, num_threads + 1, borders); equally_split(n, num_threads + 1, borders);
else else
{ {
difference_type chunk_length = (int)((double)n / ((double)num_threads + Settings::partial_sum_dilatation)), borderstart = n - num_threads * chunk_length; difference_type chunk_length =
((double)n /
((double)num_threads + Settings::partial_sum_dilatation)),
borderstart = n - num_threads * chunk_length;
borders[0] = 0; borders[0] = 0;
for (int i = 1; i < (num_threads + 1); i++) for (int i = 1; i < (num_threads + 1); i++)
{ {
...@@ -119,13 +141,13 @@ namespace __gnu_parallel ...@@ -119,13 +141,13 @@ namespace __gnu_parallel
borders[num_threads + 1] = n; borders[num_threads + 1] = n;
} }
value_type* sums = static_cast<value_type*>(::operator new(sizeof(value_type) * num_threads)); sums = static_cast<value_type*>(
::operator new(sizeof(value_type) * num_threads));
OutputIterator target_end; OutputIterator target_end;
} //single
#pragma omp parallel num_threads(num_threads) int iam = omp_get_thread_num();
{ if (iam == 0)
int id = omp_get_thread_num();
if (id == 0)
{ {
*result = *begin; *result = *begin;
parallel_partial_sum_basecase(begin + 1, begin + borders[1], parallel_partial_sum_basecase(begin + 1, begin + borders[1],
...@@ -134,44 +156,48 @@ namespace __gnu_parallel ...@@ -134,44 +156,48 @@ namespace __gnu_parallel
} }
else else
{ {
sums[id] = std::accumulate(begin + borders[id] + 1, sums[iam] = std::accumulate(begin + borders[iam] + 1,
begin + borders[id + 1], begin + borders[iam + 1],
*(begin + borders[id]), *(begin + borders[iam]),
bin_op, __gnu_parallel::sequential_tag()); bin_op, __gnu_parallel::sequential_tag());
} }
#pragma omp barrier # pragma omp barrier
#pragma omp single # pragma omp single
parallel_partial_sum_basecase(sums + 1, sums + num_threads, sums + 1, parallel_partial_sum_basecase(
bin_op, sums[0]); sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]);
#pragma omp barrier # pragma omp barrier
// Still same team. // Still same team.
parallel_partial_sum_basecase(begin + borders[id + 1], parallel_partial_sum_basecase(begin + borders[iam + 1],
begin + borders[id + 2], begin + borders[iam + 2],
result + borders[id + 1], bin_op, result + borders[iam + 1], bin_op,
sums[id]); sums[iam]);
} } //parallel
delete [] sums; delete[] sums;
delete[] borders;
return result + n; return result + n;
} }
/** @brief Parallel partial sum front-end. /** @brief Parallel partial sum front-end.
* @param begin Begin iterator of input sequence. * @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence. * @param end End iterator of input sequence.
* @param result Begin iterator of output sequence. * @param result Begin iterator of output sequence.
* @param bin_op Associative binary function. * @param bin_op Associative binary function.
* @return End iterator of output sequence. */ * @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation> template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator OutputIterator
parallel_partial_sum(InputIterator begin, InputIterator end, parallel_partial_sum(InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op) OutputIterator result, BinaryOperation bin_op)
{ {
_GLIBCXX_CALL(begin - end); _GLIBCXX_CALL(begin - end)
typedef std::iterator_traits<InputIterator> traits_type; typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
...@@ -179,14 +205,11 @@ namespace __gnu_parallel ...@@ -179,14 +205,11 @@ namespace __gnu_parallel
difference_type n = end - begin; difference_type n = end - begin;
int num_threads = get_max_threads();
switch (Settings::partial_sum_algorithm) switch (Settings::partial_sum_algorithm)
{ {
case Settings::LINEAR: case Settings::LINEAR:
// Need an initial offset. // Need an initial offset.
return parallel_partial_sum_linear(begin, end, result, bin_op, return parallel_partial_sum_linear(begin, end, result, bin_op, n);
n, num_threads);
default: default:
// Partial_sum algorithm not implemented. // Partial_sum algorithm not implemented.
_GLIBCXX_PARALLEL_ASSERT(0); _GLIBCXX_PARALLEL_ASSERT(0);
...
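The linear variant above is a two-phase scheme: every thread first reduces its own chunk to a single total, the chunk totals are prefix-summed serially in the middle, and then every thread emits its final outputs starting from the combined total of all preceding chunks. A tiny sequential model of the same idea (illustrative only, two chunks, plain ints):

#include <cstdio>

int main()
{
  const int n = 8, num_chunks = 2;
  int data[n] = {1, 2, 3, 4, 5, 6, 7, 8};
  int out[n];
  int sums[num_chunks];
  const int chunk = n / num_chunks;

  // Phase 1: per-chunk totals (done concurrently by the worker threads).
  for (int c = 0; c < num_chunks; ++c)
    {
      sums[c] = 0;
      for (int i = c * chunk; i < (c + 1) * chunk; ++i)
        sums[c] += data[i];
    }

  // Serial middle step: prefix-sum the chunk totals.
  for (int c = 1; c < num_chunks; ++c)
    sums[c] += sums[c - 1];

  // Phase 2: each chunk writes its outputs, offset by the preceding total.
  for (int c = 0; c < num_chunks; ++c)
    {
      int running = (c == 0) ? 0 : sums[c - 1];
      for (int i = c * chunk; i < (c + 1) * chunk; ++i)
        {
          running += data[i];
          out[i] = running;
        }
    }

  for (int i = 0; i < n; ++i)
    std::printf("%d ", out[i]);   // prints: 1 3 6 10 15 21 28 36
  std::printf("\n");
  return 0;
}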
...@@ -45,21 +45,21 @@ ...@@ -45,21 +45,21 @@
#include <bits/stl_algo.h> #include <bits/stl_algo.h>
#include <parallel/parallel.h> #include <parallel/parallel.h>
/** @brief Decide whether to declare certain variable volatile in this file. */ /** @brief Decide whether to declare certain variables volatile. */
#define _GLIBCXX_VOLATILE volatile #define _GLIBCXX_VOLATILE volatile
namespace __gnu_parallel namespace __gnu_parallel
{ {
/** @brief Parallel implementation of std::partition. /** @brief Parallel implementation of std::partition.
* @param begin Begin iterator of input sequence to split. * @param begin Begin iterator of input sequence to split.
* @param end End iterator of input sequence to split. * @param end End iterator of input sequence to split.
* @param pred Partition predicate, possibly including some kind of pivot. * @param pred Partition predicate, possibly including some kind of pivot.
* @param max_num_threads Maximum number of threads to use for this task. * @param num_threads Maximum number of threads to use for this task.
* @return Number of elements not fulfilling the predicate. */ * @return Number of elements not fulfilling the predicate. */
template<typename RandomAccessIterator, typename Predicate> template<typename RandomAccessIterator, typename Predicate>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_partition(RandomAccessIterator begin, RandomAccessIterator end, parallel_partition(RandomAccessIterator begin, RandomAccessIterator end,
Predicate pred, thread_index_t max_num_threads) Predicate pred, thread_index_t num_threads)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
...@@ -74,25 +74,37 @@ namespace __gnu_parallel ...@@ -74,25 +74,37 @@ namespace __gnu_parallel
_GLIBCXX_VOLATILE difference_type leftover_left, leftover_right; _GLIBCXX_VOLATILE difference_type leftover_left, leftover_right;
_GLIBCXX_VOLATILE difference_type leftnew, rightnew; _GLIBCXX_VOLATILE difference_type leftnew, rightnew;
bool* reserved_left, * reserved_right; bool* reserved_left = NULL, * reserved_right = NULL;
reserved_left = new bool[max_num_threads];
reserved_right = new bool[max_num_threads];
difference_type chunk_size; difference_type chunk_size;
if (Settings::partition_chunk_share > 0.0)
chunk_size = std::max((difference_type)Settings::partition_chunk_size, (difference_type)((double)n * Settings::partition_chunk_share / (double)max_num_threads));
else
chunk_size = Settings::partition_chunk_size;
omp_lock_t result_lock; omp_lock_t result_lock;
omp_init_lock(&result_lock); omp_init_lock(&result_lock);
// At least good for two processors. //at least two chunks per thread
while (right - left + 1 >= 2 * max_num_threads * chunk_size) if(right - left + 1 >= 2 * num_threads * chunk_size)
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
reserved_left = new bool[num_threads];
reserved_right = new bool[num_threads];
if (Settings::partition_chunk_share > 0.0)
chunk_size = std::max<difference_type>(
Settings::partition_chunk_size,
(double)n * Settings::partition_chunk_share /
(double)num_threads);
else
chunk_size = Settings::partition_chunk_size;
}
while (right - left + 1 >= 2 * num_threads * chunk_size)
{
# pragma omp single
{ {
difference_type num_chunks = (right - left + 1) / chunk_size; difference_type num_chunks = (right - left + 1) / chunk_size;
thread_index_t num_threads = (int)std::min((difference_type)max_num_threads, num_chunks / 2);
for (int r = 0; r < num_threads; r++) for (int r = 0; r < num_threads; r++)
{ {
...@@ -101,11 +113,11 @@ namespace __gnu_parallel ...@@ -101,11 +113,11 @@ namespace __gnu_parallel
} }
leftover_left = 0; leftover_left = 0;
leftover_right = 0; leftover_right = 0;
} //implicit barrier
#pragma omp parallel num_threads(num_threads)
{
// Private. // Private.
difference_type thread_left, thread_left_border, thread_right, thread_right_border; difference_type thread_left, thread_left_border,
thread_right, thread_right_border;
thread_left = left + 1; thread_left = left + 1;
// Just to satisfy the condition below. // Just to satisfy the condition below.
...@@ -150,12 +162,15 @@ namespace __gnu_parallel ...@@ -150,12 +162,15 @@ namespace __gnu_parallel
// Swap as usual. // Swap as usual.
while (thread_left < thread_right) while (thread_left < thread_right)
{ {
while (pred(begin[thread_left]) && thread_left <= thread_left_border) while (pred(begin[thread_left])
&& thread_left <= thread_left_border)
thread_left++; thread_left++;
while (!pred(begin[thread_right]) && thread_right >= thread_right_border) while (!pred(begin[thread_right])
&& thread_right >= thread_right_border)
thread_right--; thread_right--;
if (thread_left > thread_left_border || thread_right < thread_right_border) if (thread_left > thread_left_border
|| thread_right < thread_right_border)
// Fetch new chunk(s). // Fetch new chunk(s).
break; break;
...@@ -167,28 +182,29 @@ namespace __gnu_parallel ...@@ -167,28 +182,29 @@ namespace __gnu_parallel
// Now swap the leftover chunks to the right places. // Now swap the leftover chunks to the right places.
if (thread_left <= thread_left_border) if (thread_left <= thread_left_border)
#pragma omp atomic # pragma omp atomic
leftover_left++; leftover_left++;
if (thread_right >= thread_right_border) if (thread_right >= thread_right_border)
#pragma omp atomic # pragma omp atomic
leftover_right++; leftover_right++;
#pragma omp barrier # pragma omp barrier
#pragma omp single # pragma omp single
{ {
leftnew = left - leftover_left * chunk_size; leftnew = left - leftover_left * chunk_size;
rightnew = right + leftover_right * chunk_size; rightnew = right + leftover_right * chunk_size;
} }
#pragma omp barrier # pragma omp barrier
// <=> thread_left_border + (chunk_size - 1) >= leftnew // <=> thread_left_border + (chunk_size - 1) >= leftnew
if (thread_left <= thread_left_border if (thread_left <= thread_left_border
&& thread_left_border >= leftnew) && thread_left_border >= leftnew)
{ {
// Chunk already in place, reserve spot. // Chunk already in place, reserve spot.
reserved_left[(left - (thread_left_border + 1)) / chunk_size] = true; reserved_left[(left - (thread_left_border + 1)) / chunk_size]
= true;
} }
// <=> thread_right_border - (chunk_size - 1) <= rightnew // <=> thread_right_border - (chunk_size - 1) <= rightnew
...@@ -196,12 +212,15 @@ namespace __gnu_parallel ...@@ -196,12 +212,15 @@ namespace __gnu_parallel
&& thread_right_border <= rightnew) && thread_right_border <= rightnew)
{ {
// Chunk already in place, reserve spot. // Chunk already in place, reserve spot.
reserved_right[((thread_right_border - 1) - right) / chunk_size] = true; reserved_right
[((thread_right_border - 1) - right) / chunk_size]
= true;
} }
#pragma omp barrier # pragma omp barrier
if (thread_left <= thread_left_border && thread_left_border < leftnew) if (thread_left <= thread_left_border
&& thread_left_border < leftnew)
{ {
// Find spot and swap. // Find spot and swap.
difference_type swapstart = -1; difference_type swapstart = -1;
...@@ -219,7 +238,10 @@ namespace __gnu_parallel ...@@ -219,7 +238,10 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1); _GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif #endif
std::swap_ranges(begin + thread_left_border - (chunk_size - 1), begin + thread_left_border + 1, begin + swapstart); std::swap_ranges(
begin + thread_left_border - (chunk_size - 1),
begin + thread_left_border + 1,
begin + swapstart);
} }
if (thread_right >= thread_right_border if (thread_right >= thread_right_border
...@@ -241,12 +263,14 @@ namespace __gnu_parallel ...@@ -241,12 +263,14 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1); _GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif #endif
std::swap_ranges(begin + thread_right_border, begin + thread_right_border + chunk_size, begin + swapstart); std::swap_ranges(begin + thread_right_border,
begin + thread_right_border + chunk_size,
begin + swapstart);
} }
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
#pragma omp barrier # pragma omp barrier
#pragma omp single # pragma omp single
{ {
for (int r = 0; r < leftover_left; r++) for (int r = 0; r < leftover_left; r++)
_GLIBCXX_PARALLEL_ASSERT(reserved_left[r]); _GLIBCXX_PARALLEL_ASSERT(reserved_left[r]);
...@@ -254,14 +278,16 @@ namespace __gnu_parallel ...@@ -254,14 +278,16 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(reserved_right[r]); _GLIBCXX_PARALLEL_ASSERT(reserved_right[r]);
} }
#pragma omp barrier # pragma omp barrier
#endif #endif
#pragma omp barrier # pragma omp barrier
left = leftnew; left = leftnew;
right = rightnew; right = rightnew;
} }
} // end "recursion" # pragma omp flush(left, right)
} // end "recursion" //parallel
difference_type final_left = left, final_right = right; difference_type final_left = left, final_right = right;
...@@ -298,14 +324,14 @@ namespace __gnu_parallel ...@@ -298,14 +324,14 @@ namespace __gnu_parallel
return final_left + 1; return final_left + 1;
} }
/** /**
* @brief Parallel implementation of std::nth_element(). * @brief Parallel implementation of std::nth_element().
* @param begin Begin iterator of input sequence. * @param begin Begin iterator of input sequence.
* @param nth Iterator of element that must be in position afterwards. * @param nth Iterator of element that must be in position afterwards.
* @param end End iterator of input sequence. * @param end End iterator of input sequence.
* @param comp Comparator. * @param comp Comparator.
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
void void
parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth, parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth,
RandomAccessIterator end, Comparator comp) RandomAccessIterator end, Comparator comp)
...@@ -377,12 +403,12 @@ namespace __gnu_parallel ...@@ -377,12 +403,12 @@ namespace __gnu_parallel
__gnu_sequential::sort(begin, end, comp); __gnu_sequential::sort(begin, end, comp);
} }
/** @brief Parallel implementation of std::partial_sort(). /** @brief Parallel implementation of std::partial_sort().
* @param begin Begin iterator of input sequence. * @param begin Begin iterator of input sequence.
* @param middle Sort until this position. * @param middle Sort until this position.
* @param end End iterator of input sequence. * @param end End iterator of input sequence.
* @param comp Comparator. */ * @param comp Comparator. */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
void void
parallel_partial_sort(RandomAccessIterator begin, RandomAccessIterator middle, RandomAccessIterator end, Comparator comp) parallel_partial_sort(RandomAccessIterator begin, RandomAccessIterator middle, RandomAccessIterator end, Comparator comp)
{ {
...
...@@ -53,11 +53,17 @@ namespace __gnu_parallel ...@@ -53,11 +53,17 @@ namespace __gnu_parallel
* this part. * this part.
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type inline
parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp, Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank, typename std::iterator_traits<RandomAccessIterator>::difference_type
typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads) pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type
num_samples,
thread_index_t num_threads)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
...@@ -65,20 +71,24 @@ namespace __gnu_parallel ...@@ -65,20 +71,24 @@ namespace __gnu_parallel
difference_type n = end - begin; difference_type n = end - begin;
num_samples = std::min(num_samples, n); num_samples = std::min(num_samples, n);
value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples));
// Allocate uninitialized, to avoid default constructor.
value_type* samples = static_cast<value_type*>(
operator new(num_samples * sizeof(value_type)));
for (difference_type s = 0; s < num_samples; s++) for (difference_type s = 0; s < num_samples; s++)
{ {
const unsigned long long index = static_cast<unsigned long long>(s) const unsigned long long index = static_cast<unsigned long long>(s)
* n / num_samples; * n / num_samples;
samples[s] = begin[index]; new(samples + s) value_type(begin[index]);
} }
__gnu_sequential::sort(samples, samples + num_samples, comp); __gnu_sequential::sort(samples, samples + num_samples, comp);
value_type& pivot = samples[pivot_rank * num_samples / n]; value_type& pivot = samples[pivot_rank * num_samples / n];
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot); __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
pred(comp, pivot);
difference_type split = parallel_partition(begin, end, pred, num_threads); difference_type split = parallel_partition(begin, end, pred, num_threads);
return split; return split;
...@@ -93,7 +103,10 @@ namespace __gnu_parallel ...@@ -93,7 +103,10 @@ namespace __gnu_parallel
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline void inline void
parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads) parallel_sort_qs_conquer(RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
thread_index_t num_threads)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
...@@ -110,24 +123,27 @@ namespace __gnu_parallel ...@@ -110,24 +123,27 @@ namespace __gnu_parallel
if (n <= 1) if (n <= 1)
return; return;
thread_index_t num_processors_left; thread_index_t num_threads_left;
if ((num_threads % 2) == 1) if ((num_threads % 2) == 1)
num_processors_left = num_threads / 2 + 1; num_threads_left = num_threads / 2 + 1;
else else
num_processors_left = num_threads / 2; num_threads_left = num_threads / 2;
pivot_rank = n * num_processors_left / num_threads; pivot_rank = n * num_threads_left / num_threads;
difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank, difference_type split = parallel_sort_qs_divide(
Settings::sort_qs_num_samples_preset, num_threads); begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
#pragma omp parallel sections #pragma omp parallel sections
{ {
#pragma omp section #pragma omp section
parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left); parallel_sort_qs_conquer(begin, begin + split,
comp, num_threads_left);
#pragma omp section #pragma omp section
parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left); parallel_sort_qs_conquer(begin + split, end,
comp, num_threads - num_threads_left);
} }
} }
...@@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads); ...@@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads);
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline void inline void
parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end, parallel_sort_qs(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp, Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads) typename std::iterator_traits<RandomAccessIterator>::difference_type n,
int num_threads)
{ {
_GLIBCXX_CALL(n) _GLIBCXX_CALL(n)
...@@ -165,10 +184,7 @@ Settings::sort_qs_num_samples_preset, num_threads); ...@@ -165,10 +184,7 @@ Settings::sort_qs_num_samples_preset, num_threads);
// Hard to avoid. // Hard to avoid.
omp_set_num_threads(num_threads); omp_set_num_threads(num_threads);
bool old_nested = (omp_get_nested() != 0);
omp_set_nested(true);
parallel_sort_qs_conquer(begin, begin + n, comp, num_threads); parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
omp_set_nested(old_nested);
} }
} //namespace __gnu_parallel } //namespace __gnu_parallel
...
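parallel_sort_qs_divide now takes its sample buffer as raw storage and constructs the samples with placement new, so the element type no longer needs a default constructor (the same motivation as the losertree.h change listed above). The idiom in isolation (hypothetical Sample type, not from the patch):

#include <new>
#include <cstdio>

// A value type without a default constructor.
struct Sample
{
  int key;
  explicit Sample(int k) : key(k) {}
};

int main()
{
  const int num_samples = 3;

  // Raw, uninitialized storage: no default constructors are required or run.
  Sample* samples = static_cast<Sample*>(
    operator new(num_samples * sizeof(Sample)));

  // Construct each element in place from an existing value.
  for (int s = 0; s < num_samples; ++s)
    new(samples + s) Sample(10 * s);

  std::printf("pivot candidate key: %d\n", samples[1].key);

  // Destroy explicitly, then release the raw storage.
  for (int s = 0; s < num_samples; ++s)
    samples[s].~Sample();
  operator delete(samples);
  return 0;
}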
...@@ -53,7 +53,7 @@ namespace __gnu_parallel ...@@ -53,7 +53,7 @@ namespace __gnu_parallel
* @param length Length of sequence to search for. * @param length Length of sequence to search for.
* @param advances Returned offsets. * @param advances Returned offsets.
*/ */
template<typename RandomAccessIterator, typename _DifferenceTp> template<typename RandomAccessIterator, typename _DifferenceTp>
void void
calc_borders(RandomAccessIterator elements, _DifferenceTp length, calc_borders(RandomAccessIterator elements, _DifferenceTp length,
_DifferenceTp* off) _DifferenceTp* off)
...@@ -81,7 +81,10 @@ namespace __gnu_parallel ...@@ -81,7 +81,10 @@ namespace __gnu_parallel
* @param end2 End iterator of second sequence. * @param end2 End iterator of second sequence.
* @param pred Find predicate. * @param pred Find predicate.
* @return Place of finding in first sequences. */ * @return Place of finding in first sequences. */
template<typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename Pred> template<
typename _RandomAccessIterator1,
typename _RandomAccessIterator2,
typename Pred>
_RandomAccessIterator1 _RandomAccessIterator1
search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1, search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1,
_RandomAccessIterator2 begin2, _RandomAccessIterator2 end2, _RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
...@@ -103,27 +106,34 @@ namespace __gnu_parallel ...@@ -103,27 +106,34 @@ namespace __gnu_parallel
// Where is first occurrence of pattern? defaults to end. // Where is first occurrence of pattern? defaults to end.
difference_type result = (end1 - begin1); difference_type result = (end1 - begin1);
difference_type *splitters;
// Pattern too long. // Pattern too long.
if (input_length < 0) if (input_length < 0)
return end1; return end1;
thread_index_t num_threads = std::max<difference_type>(1, std::min<difference_type>(input_length, __gnu_parallel::get_max_threads()));
omp_lock_t result_lock; omp_lock_t result_lock;
omp_init_lock(&result_lock); omp_init_lock(&result_lock);
difference_type borders[num_threads + 1]; thread_index_t num_threads =
__gnu_parallel::equally_split(input_length, num_threads, borders); std::max<difference_type>(1,
std::min<difference_type>(input_length, get_max_threads()));
difference_type advances[pattern_length]; difference_type advances[pattern_length];
calc_borders(begin2, pattern_length, advances); calc_borders(begin2, pattern_length, advances);
#pragma omp parallel num_threads(num_threads) # pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{ {
num_threads = omp_get_num_threads();
splitters = new difference_type[num_threads + 1];
equally_split(input_length, num_threads, splitters);
}
thread_index_t iam = omp_get_thread_num(); thread_index_t iam = omp_get_thread_num();
difference_type start = borders[iam], stop = borders[iam + 1]; difference_type start = splitters[iam], stop = splitters[iam + 1];
difference_type pos_in_pattern = 0; difference_type pos_in_pattern = 0;
bool found_pattern = false; bool found_pattern = false;
...@@ -131,11 +141,12 @@ namespace __gnu_parallel ...@@ -131,11 +141,12 @@ namespace __gnu_parallel
while (start <= stop && !found_pattern) while (start <= stop && !found_pattern)
{ {
// Get new value of result. // Get new value of result.
#pragma omp flush(result) #pragma omp flush(result)
// No chance for this thread to find first occurrence. // No chance for this thread to find first occurrence.
if (result < start) if (result < start)
break; break;
while (pred(begin1[start + pos_in_pattern], begin2[pos_in_pattern])) while (pred(begin1[start + pos_in_pattern],
begin2[pos_in_pattern]))
{ {
++pos_in_pattern; ++pos_in_pattern;
if (pos_in_pattern == pattern_length) if (pos_in_pattern == pattern_length)
...@@ -151,12 +162,15 @@ namespace __gnu_parallel ...@@ -151,12 +162,15 @@ namespace __gnu_parallel
} }
// Make safe jump. // Make safe jump.
start += (pos_in_pattern - advances[pos_in_pattern]); start += (pos_in_pattern - advances[pos_in_pattern]);
pos_in_pattern = (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern]; pos_in_pattern =
} (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
} }
} //parallel
omp_destroy_lock(&result_lock); omp_destroy_lock(&result_lock);
delete[] splitters;
// Return iterator on found element. // Return iterator on found element.
return (begin1 + result); return (begin1 + result);
} }
...
...@@ -44,13 +44,16 @@ ...@@ -44,13 +44,16 @@
namespace __gnu_parallel namespace __gnu_parallel
{ {
/** @brief Parallel std::unique_copy(), without explicit equality predicate. /** @brief Parallel std::unique_copy(), w/o explicit equality predicate.
* @param first Begin iterator of input sequence. * @param first Begin iterator of input sequence.
* @param last End iterator of input sequence. * @param last End iterator of input sequence.
* @param result Begin iterator of result sequence. * @param result Begin iterator of result sequence.
* @param binary_pred Equality predicate. * @param binary_pred Equality predicate.
* @return End iterator of result sequence. */ * @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator, class BinaryPredicate> template<
typename InputIterator,
class OutputIterator,
class BinaryPredicate>
inline OutputIterator inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last, parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result, BinaryPredicate binary_pred) OutputIterator result, BinaryPredicate binary_pred)
...@@ -62,20 +65,27 @@ namespace __gnu_parallel ...@@ -62,20 +65,27 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
difference_type size = last - first; difference_type size = last - first;
int num_threads = __gnu_parallel::get_max_threads();
difference_type counter[num_threads + 1];
if (size == 0) if (size == 0)
return result; return result;
// Let the first thread process two parts. // Let the first thread process two parts.
difference_type borders[num_threads + 2]; difference_type *counter;
__gnu_parallel::equally_split(size, num_threads + 1, borders); difference_type *borders;
thread_index_t num_threads = get_max_threads();
// First part contains at least one element. // First part contains at least one element.
#pragma omp parallel num_threads(num_threads) # pragma omp parallel num_threads(num_threads)
{ {
int iam = omp_get_thread_num(); # pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
equally_split(size, num_threads + 1, borders);
counter = new difference_type[num_threads + 1];
}
thread_index_t iam = omp_get_thread_num();
difference_type begin, end; difference_type begin, end;
...@@ -83,6 +93,7 @@ namespace __gnu_parallel ...@@ -83,6 +93,7 @@ namespace __gnu_parallel
// Needed for position in output // Needed for position in output
difference_type i = 0; difference_type i = 0;
OutputIterator out = result; OutputIterator out = result;
if (iam == 0) if (iam == 0)
{ {
begin = borders[0] + 1; // == 1 begin = borders[0] + 1; // == 1
...@@ -120,7 +131,7 @@ namespace __gnu_parallel ...@@ -120,7 +131,7 @@ namespace __gnu_parallel
// Last part still untouched. // Last part still untouched.
difference_type begin_output; difference_type begin_output;
#pragma omp barrier # pragma omp barrier
// Store result in output on calculated positions. // Store result in output on calculated positions.
begin_output = 0; begin_output = 0;
...@@ -170,15 +181,17 @@ namespace __gnu_parallel ...@@ -170,15 +181,17 @@ namespace __gnu_parallel
for (int t = 0; t < num_threads + 1; t++) for (int t = 0; t < num_threads + 1; t++)
end_output += counter[t]; end_output += counter[t];
delete[] borders;
return result + end_output; return result + end_output;
} }
/** @brief Parallel std::unique_copy(), without explicit equality predicate /** @brief Parallel std::unique_copy(), without explicit equality predicate
* @param first Begin iterator of input sequence. * @param first Begin iterator of input sequence.
* @param last End iterator of input sequence. * @param last End iterator of input sequence.
* @param result Begin iterator of result sequence. * @param result Begin iterator of result sequence.
* @return End iterator of result sequence. */ * @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator> template<typename InputIterator, class OutputIterator>
inline OutputIterator inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last, parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result) OutputIterator result)
...
...@@ -55,8 +55,8 @@ namespace __gnu_parallel ...@@ -55,8 +55,8 @@ namespace __gnu_parallel
#define _GLIBCXX_JOB_VOLATILE volatile #define _GLIBCXX_JOB_VOLATILE volatile
/** @brief One job for a certain thread. */ /** @brief One job for a certain thread. */
template<typename _DifferenceTp> template<typename _DifferenceTp>
struct Job struct Job
{ {
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
...@@ -78,7 +78,7 @@ namespace __gnu_parallel ...@@ -78,7 +78,7 @@ namespace __gnu_parallel
_GLIBCXX_JOB_VOLATILE difference_type load; _GLIBCXX_JOB_VOLATILE difference_type load;
}; };
/** @brief Work stealing algorithm for random access iterators. /** @brief Work stealing algorithm for random access iterators.
* *
* Uses O(1) additional memory. Synchronization at job lists is * Uses O(1) additional memory. Synchronization at job lists is
* done with atomic operations. * done with atomic operations.
...@@ -96,13 +96,20 @@ namespace __gnu_parallel ...@@ -96,13 +96,20 @@ namespace __gnu_parallel
* std::count_n()). * std::count_n()).
* @return User-supplied functor (that may contain a part of the result). * @return User-supplied functor (that may contain a part of the result).
*/ */
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result> template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op Op
for_each_template_random_access_workstealing(RandomAccessIterator begin, for_each_template_random_access_workstealing(
RandomAccessIterator begin,
RandomAccessIterator end, RandomAccessIterator end,
Op op, Fu& f, Red r, Op op, Fu& f, Red r,
Result base, Result& output, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound) typename std::iterator_traits<RandomAccessIterator>::difference_type
bound)
{ {
_GLIBCXX_CALL(end - begin) _GLIBCXX_CALL(end - begin)
...@@ -110,34 +117,43 @@ namespace __gnu_parallel ...@@ -110,34 +117,43 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
difference_type chunk_size = static_cast<difference_type>(Settings::workstealing_chunk_size); difference_type chunk_size =
static_cast<difference_type>(Settings::workstealing_chunk_size);
// How many jobs? // How many jobs?
difference_type length = (bound < 0) ? (end - begin) : bound; difference_type length = (bound < 0) ? (end - begin) : bound;
// To avoid false sharing in a cache line. // To avoid false sharing in a cache line.
const int stride = Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1; const int stride =
Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
// Total number of threads currently working. // Total number of threads currently working.
thread_index_t busy = 0; thread_index_t busy = 0;
thread_index_t num_threads = get_max_threads();
difference_type num_threads_min = num_threads < end - begin ? num_threads : end - begin; Job<difference_type> *job;
omp_lock_t output_lock; omp_lock_t output_lock;
omp_init_lock(&output_lock); omp_init_lock(&output_lock);
// No more threads than jobs, at least one thread.
difference_type num_threads_max = num_threads_min > 1 ? num_threads_min : 1;
num_threads = static_cast<thread_index_t>(num_threads_max);
// Create job description array.
Job<difference_type> *job = new Job<difference_type>[num_threads * stride];
// Write base value to output. // Write base value to output.
output = base; output = base;
#pragma omp parallel shared(busy) num_threads(num_threads) // No more threads than jobs, at least one thread.
thread_index_t num_threads =
__gnu_parallel::max<thread_index_t>(1,
__gnu_parallel::min<difference_type>(length, get_max_threads()));
# pragma omp parallel shared(busy) num_threads(num_threads)
{ {
# pragma omp single
{
num_threads = omp_get_num_threads();
// Create job description array.
job = new Job<difference_type>[num_threads * stride];
}
// Initialization phase. // Initialization phase.
// Flags for every thread if it is doing productive work. // Flags for every thread if it is doing productive work.
...@@ -158,19 +174,22 @@ namespace __gnu_parallel ...@@ -158,19 +174,22 @@ namespace __gnu_parallel
// Number of elements to steal in one attempt. // Number of elements to steal in one attempt.
difference_type steal; difference_type steal;
// Every thread has its own random number generator (modulo num_threads). // Every thread has its own random number generator
// (modulo num_threads).
random_number rand_gen(iam, num_threads); random_number rand_gen(iam, num_threads);
#pragma omp atomic
// This thread is currently working. // This thread is currently working.
# pragma omp atomic
busy++; busy++;
iam_working = true; iam_working = true;
// How many jobs per thread? last thread gets the rest. // How many jobs per thread? last thread gets the rest.
my_job.first = static_cast<difference_type>(iam * (length / num_threads)); my_job.first =
static_cast<difference_type>(iam * (length / num_threads));
my_job.last = (iam == (num_threads - 1)) ? (length - 1) : ((iam + 1) * (length / num_threads) - 1); my_job.last = (iam == (num_threads - 1)) ?
(length - 1) : ((iam + 1) * (length / num_threads) - 1);
my_job.load = my_job.last - my_job.first + 1; my_job.load = my_job.last - my_job.first + 1;
// Init result with first value (to have a base value for reduction). // Init result with first value (to have a base value for reduction).
...@@ -185,26 +204,29 @@ namespace __gnu_parallel ...@@ -185,26 +204,29 @@ namespace __gnu_parallel
RandomAccessIterator current; RandomAccessIterator current;
#pragma omp barrier # pragma omp barrier
// Actual work phase // Actual work phase
// Work on own or stolen start // Work on own or stolen start
while (busy > 0) while (busy > 0)
{ {
// Work until no productive thread left. // Work until no productive thread left.
#pragma omp flush(busy) # pragma omp flush(busy)
// Thread has own work to do // Thread has own work to do
while (my_job.first <= my_job.last) while (my_job.first <= my_job.last)
{ {
// fetch-and-add call // fetch-and-add call
// Reserve current job block (size chunk_size) in my queue. // Reserve current job block (size chunk_size) in my queue.
difference_type current_job = fetch_and_add<difference_type>(&(my_job.first), chunk_size); difference_type current_job =
fetch_and_add<difference_type>(&(my_job.first), chunk_size);
// Update load, to make the three values consistent, // Update load, to make the three values consistent,
// first might have been changed in the meantime // first might have been changed in the meantime
my_job.load = my_job.last - my_job.first + 1; my_job.load = my_job.last - my_job.first + 1;
for (difference_type job_counter = 0; job_counter < chunk_size && current_job <= my_job.last; job_counter++) for (difference_type job_counter = 0;
job_counter < chunk_size && current_job <= my_job.last;
job_counter++)
{ {
// Yes: process it! // Yes: process it!
current = begin + current_job; current = begin + current_job;
...@@ -214,15 +236,14 @@ namespace __gnu_parallel ...@@ -214,15 +236,14 @@ namespace __gnu_parallel
result = r(result, f(op, current)); result = r(result, f(op, current));
} }
#pragma omp flush(busy) # pragma omp flush(busy)
} }
// After reaching this point, a thread's job list is empty. // After reaching this point, a thread's job list is empty.
if (iam_working) if (iam_working)
{ {
#pragma omp atomic
// This thread no longer has work. // This thread no longer has work.
# pragma omp atomic
busy--; busy--;
iam_working = false; iam_working = false;
...@@ -231,16 +252,17 @@ namespace __gnu_parallel ...@@ -231,16 +252,17 @@ namespace __gnu_parallel
difference_type supposed_first, supposed_last, supposed_load; difference_type supposed_first, supposed_last, supposed_load;
do do
{ {
// Find random nonempty deque (not own) and do consistency check. // Find random nonempty deque (not own), do consistency check.
yield(); yield();
#pragma omp flush(busy) # pragma omp flush(busy)
victim = rand_gen(); victim = rand_gen();
supposed_first = job[victim * stride].first; supposed_first = job[victim * stride].first;
supposed_last = job[victim * stride].last; supposed_last = job[victim * stride].last;
supposed_load = job[victim * stride].load; supposed_load = job[victim * stride].load;
} }
while (busy > 0 while (busy > 0
&& ((supposed_load <= 0) || ((supposed_first + supposed_load - 1) != supposed_last))); && ((supposed_load <= 0)
|| ((supposed_first + supposed_load - 1) != supposed_last)));
if (busy == 0) if (busy == 0)
break; break;
...@@ -251,40 +273,30 @@ namespace __gnu_parallel ...@@ -251,40 +273,30 @@ namespace __gnu_parallel
// Number of elements to steal (at least one). // Number of elements to steal (at least one).
steal = (supposed_load < 2) ? 1 : supposed_load / 2; steal = (supposed_load < 2) ? 1 : supposed_load / 2;
// Protects against stealing threads
// omp_set_lock(&(job[victim * stride].lock));
// Push victim's start forward. // Push victim's start forward.
difference_type stolen_first = fetch_and_add<difference_type>(&(job[victim * stride].first), steal); difference_type stolen_first =
difference_type stolen_try = stolen_first + steal - difference_type(1); fetch_and_add<difference_type>(
&(job[victim * stride].first), steal);
// Protects against working thread difference_type stolen_try =
// omp_unset_lock(&(job[victim * stride].lock)); stolen_first + steal - difference_type(1);
my_job.first = stolen_first; my_job.first = stolen_first;
my_job.last = __gnu_parallel::min(stolen_try, supposed_last);
// Avoid std::min dependencies.
my_job.last = stolen_try < supposed_last ? stolen_try : supposed_last;
my_job.load = my_job.last - my_job.first + 1; my_job.load = my_job.last - my_job.first + 1;
//omp_unset_lock(&(my_job.lock));
#pragma omp atomic
// Has potential work again. // Has potential work again.
# pragma omp atomic
busy++; busy++;
iam_working = true; iam_working = true;
#pragma omp flush(busy) # pragma omp flush(busy)
} }
#pragma omp flush(busy) # pragma omp flush(busy)
} // end while busy > 0 } // end while busy > 0
// Add accumulated result to output. // Add accumulated result to output.
omp_set_lock(&output_lock); omp_set_lock(&output_lock);
output = r(output, result); output = r(output, result);
omp_unset_lock(&output_lock); omp_unset_lock(&output_lock);
//omp_destroy_lock(&(my_job.lock));
} }
delete[] job; delete[] job;
...
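The heart of the work-stealing loop above is the fetch-and-add on a job's first index: a thread atomically advances it by chunk_size and thereby owns the indices it skipped over, whether the job is its own or a victim's. A self-contained sketch of that reservation step (using the GCC __sync_fetch_and_add builtin as a stand-in for the library's own fetch_and_add helper; all names are made up):

#include <omp.h>
#include <cstdio>

int main()
{
  const long length = 100, chunk_size = 8;
  long first = 0;            // shared queue head, advanced atomically
  long processed = 0;        // sanity check: every index taken exactly once

#pragma omp parallel reduction(+:processed)
  {
    for (;;)
      {
        // Atomically reserve [current_job, current_job + chunk_size).
        long current_job = __sync_fetch_and_add(&first, chunk_size);
        if (current_job >= length)
          break;
        long stop = (current_job + chunk_size < length)
          ? current_job + chunk_size : length;
        for (long i = current_job; i < stop; ++i)
          ++processed;       // stand-in for the per-element work
      }
  }

  std::printf("processed %ld of %ld elements\n", processed, length);
  return 0;
}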