Commit e683ee2a, authored and committed by Johannes Singler

re PR libstdc++/33893 ([parallel mode] Algorithms rely on omp_set_dynamic(false))

2007-11-22  Johannes Singler  <singler@ira.uka.de>

        PR libstdc++/33893
        * include/parallel/multiway_merge.h: made omp_dynamic-safe
        * include/parallel/workstealing.h: made omp_dynamic-safe
        * include/parallel/base.h: infrastructure, cleanup
        * include/parallel/par_loop.h: made omp_dynamic-safe
        * include/parallel/features.h: activate loser tree variant
        * include/parallel/quicksort.h: made omp_dynamic-safe
        * include/parallel/compiletime_settings.h: settings overridable
        * include/parallel/equally_split.h: made omp_dynamic-safe
        * include/parallel/omp_loop_static.h: made omp_dynamic-safe
        * include/parallel/random_shuffle.h: made omp_dynamic-safe
        * include/parallel/balanced_quicksort.h: made omp_dynamic-safe
        * include/parallel/set_operations.h: made omp_dynamic-safe
        * include/parallel/unique_copy.h: made omp_dynamic-safe
        * include/parallel/multiway_mergesort.h: made omp_dynamic-safe
        * include/parallel/search.h: made omp_dynamic-safe
        * include/parallel/partition.h: made omp_dynamic-safe
        * include/parallel/partial_sum.h: made omp_dynamic-safe
        * include/parallel/find.h: made omp_dynamic-safe
        * include/parallel/omp_loop.h: made omp_dynamic-safe
        * include/parallel/losertree.h: avoid default constructor

From-SVN: r130347
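The pattern behind the recurring ChangeLog entry "made omp_dynamic-safe" is visible in most hunks below: instead of trusting that the requested number of threads will actually be granted (which omp_set_dynamic(true) does not guarantee), the rewritten code enters the parallel region first, asks omp_get_num_threads() inside an omp single block, and only then allocates per-thread result storage. A minimal standalone sketch of that shape follows; the function and all names in it are illustrative, not libstdc++ code.

// Standalone sketch of the "omp_dynamic-safe" pattern used throughout this
// commit; reduce_example() and its names are hypothetical, not libstdc++.
#include <omp.h>
#include <cstdio>

int reduce_example(const int* data, int n)
{
  int requested = omp_get_max_threads();
  int num_threads = requested < n ? requested : n;
  int* thread_results = 0;      // allocated only once the team size is known

#pragma omp parallel num_threads(num_threads)
  {
#pragma omp single
    {
      // With omp_set_dynamic(true) the runtime may grant fewer threads than
      // requested, so the team size is (re)queried inside the region.
      num_threads = omp_get_num_threads();
      thread_results = new int[num_threads];
      for (int i = 0; i < num_threads; ++i)
        thread_results[i] = 0;
    }                           // implicit barrier: allocation is visible

    int iam = omp_get_thread_num();
#pragma omp for
    for (int pos = 0; pos < n; ++pos)
      thread_results[iam] += data[pos];
  }                             // end parallel

  int sum = 0;
  for (int i = 0; i < num_threads; ++i)
    sum += thread_results[i];
  delete[] thread_results;
  return sum;
}

int main()
{
  int data[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  std::printf("%d\n", reduce_example(data, 8));   // prints 36
  return 0;
}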
include/parallel/compiletime_settings.h
@@ -39,7 +39,7 @@
#include <cstdio>
/** @brief Determine verbosity level of the parallel mode.
* Level 1 prints a message each time when entering a parallel-mode function. */
* Level 1 prints a message each time a parallel-mode function is entered. */
#define _GLIBCXX_VERBOSE_LEVEL 0
/** @def _GLIBCXX_CALL
......@@ -50,27 +50,40 @@
#define _GLIBCXX_CALL(n)
#endif
#if (_GLIBCXX_VERBOSE_LEVEL == 1)
#define _GLIBCXX_CALL(n) printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#define _GLIBCXX_CALL(n) \
printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", \
__PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#endif
#ifndef _GLIBCXX_SCALE_DOWN_FPU
/** @brief Use floating-point scaling instead of modulo for mapping
* random numbers to a range. This can be faster on certain CPUs. */
#define _GLIBCXX_SCALE_DOWN_FPU 0
#endif
#ifndef _GLIBCXX_ASSERTIONS
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Should be switched on only locally. */
#define _GLIBCXX_ASSERTIONS 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Consider the size of the L1 cache for __gnu_parallel::parallel_random_shuffle(). */
* Consider the size of the L1 cache for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Consider the size of the TLB for __gnu_parallel::parallel_random_shuffle(). */
* Consider the size of the TLB for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
#endif
#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
/** @brief First copy the data, sort it locally, and merge it back
* (0); or copy it back after everything is done (1).
*
* Recommendation: 0 */
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
#endif
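Since every knob in this header is wrapped in #ifndef ("settings overridable" in the ChangeLog above), a build can override it before the parallel headers are pulled in. A hypothetical translation unit, compiled with -fopenmp; the chosen value 1 is only an example, not a recommendation:

// Hypothetical override of one of the guarded settings above.
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 1
#include <parallel/algorithm>   // includes compiletime_settings.h afterwards

int main() { return 0; }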
include/parallel/equally_split.h
@@ -39,30 +39,58 @@
namespace __gnu_parallel
{
/** @brief Function to split a sequence into parts of almost equal size.
/** @brief Function to split a sequence into parts of almost equal size.
*
* The resulting sequence s of length p+1 contains the splitting
* The resulting sequence s of length num_threads+1 contains the splitting
* positions when splitting the range [0,n) into parts of almost
* equal size (plus minus 1). The first entry is 0, the last one
* n. There may result empty parts.
* @param n Number of elements
* @param p Number of parts
* @param num_threads Number of parts
* @param s Splitters
* @returns End of splitter sequence, i. e. @c s+p+1 */
template<typename _DifferenceTp, typename OutputIterator>
* @returns End of splitter sequence, i. e. @c s+num_threads+1 */
template<typename difference_type, typename OutputIterator>
OutputIterator
equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s)
equally_split(difference_type n,
thread_index_t num_threads,
OutputIterator s)
{
typedef _DifferenceTp difference_type;
difference_type chunk_length = n / p, split = n % p, start = 0;
for (int i = 0; i < p; i++)
difference_type chunk_length = n / num_threads,
num_longer_chunks = n % num_threads,
pos = 0;
for (thread_index_t i = 0; i < num_threads; ++i)
{
*s++ = start;
start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length;
*s++ = pos;
pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
}
*s++ = n;
return s;
}
/** @brief Function to split a sequence into parts of almost equal size.
*
* Returns the position of the splitting point between
* thread number thread_no (included) and
* thread number thread_no+1 (excluded).
* @param n Number of elements
* @param num_threads Number of parts
* @returns Splitting point */
template<typename difference_type>
difference_type
equally_split_point(difference_type n,
thread_index_t num_threads,
thread_index_t thread_no)
{
difference_type chunk_length = n / num_threads,
num_longer_chunks = n % num_threads;
if(thread_no < num_longer_chunks)
return thread_no * (chunk_length + 1);
else
return num_longer_chunks * (chunk_length + 1)
+ (thread_no - num_longer_chunks) * chunk_length;
}
}
#endif
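The rewritten equally_split renames p to num_threads, and the new equally_split_point returns a single splitter directly: the first n % num_threads parts get one extra element each. A standalone restatement of the rule, with a small check (the names mirror the header but this is not the libstdc++ code itself):

// Standalone re-statement of the splitting rule used by equally_split /
// equally_split_point above (assumption: plain long/int indices for brevity).
#include <cassert>

long equally_split_point_sketch(long n, int num_threads, int thread_no)
{
  long chunk_length = n / num_threads;
  long num_longer_chunks = n % num_threads;
  if (thread_no < num_longer_chunks)
    return thread_no * (chunk_length + 1);
  return num_longer_chunks * (chunk_length + 1)
         + (thread_no - num_longer_chunks) * chunk_length;
}

int main()
{
  // Splitting 10 elements over 4 threads gives parts of size 3, 3, 2, 2,
  // i.e. the splitter sequence 0, 3, 6, 8, 10.
  long expected[5] = { 0, 3, 6, 8, 10 };
  for (int i = 0; i <= 4; ++i)
    assert(equally_split_point_sketch(10, 4, i) == expected[i]);
  return 0;
}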
include/parallel/features.h
@@ -66,7 +66,7 @@
* @brief Include guarded (sequences may run empty) loser tree,
* moving objects.
* @see __gnu_parallel::Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE 0
#define _GLIBCXX_LOSER_TREE 1
#endif
#ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
......
include/parallel/omp_loop.h
@@ -43,10 +43,11 @@
#include <parallel/settings.h>
#include <parallel/basic_iterator.h>
#include <parallel/base.h>
namespace __gnu_parallel
{
/** @brief Embarrassingly parallel algorithm for random access
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using an OpenMP for loop.
*
* @param begin Begin iterator of element sequence.
......@@ -63,34 +64,50 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_omp_loop(RandomAccessIterator begin, RandomAccessIterator end, Op o, Fu& f, Red r, Result base, Result& output, typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_omp_loop(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type;
typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : static_cast<thread_index_t>((end - begin));
Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++)
{
thread_results[i] = r(thread_results[i], f(o, begin+i));
}
Result *thread_results;
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
#pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
# pragma omp single
{
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
}
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
for (thread_index_t i = 0; i < num_threads; i++)
thread_results[i] = Result();
}
thread_index_t iam = omp_get_thread_num();
# pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
{
output = r(output, thread_results[i]);
}
delete [] thread_results;
......@@ -100,6 +117,7 @@ namespace __gnu_parallel
return o;
}
} // end namespace
#endif
include/parallel/omp_loop_static.h
@@ -64,39 +64,50 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_omp_loop_static(RandomAccessIterator begin,
for_each_template_random_access_omp_loop_static(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r,
Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin);
Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++)
{
thread_results[i] = r(thread_results[i], f(o, begin+i));
}
Result *thread_results;
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
#pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
# pragma omp single
{
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
}
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
for (thread_index_t i = 0; i < num_threads; i++)
thread_results[i] = Result();
}
thread_index_t iam = omp_get_thread_num();
# pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
{
output = r(output, thread_results[i]);
}
delete [] thread_results;
......@@ -106,6 +117,7 @@ namespace __gnu_parallel
return o;
}
} // end namespace
#endif
include/parallel/par_loop.h
@@ -41,11 +41,12 @@
#include <omp.h>
#include <parallel/settings.h>
#include <parallel/base.h>
namespace __gnu_parallel
{
/** @brief Embarrassingly parallel algorithm for random access
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using hand-crafted parallelization by equal splitting
* the work.
*
......@@ -63,47 +64,57 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_ed(RandomAccessIterator begin,
RandomAccessIterator end, Op o, Fu& f,
Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_ed(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
const difference_type length = end - begin;
const difference_type settings_threads = static_cast<difference_type>(get_max_threads());
const difference_type dmin = settings_threads < length ? settings_threads : length;
const difference_type dmax = dmin > 1 ? dmin : 1;
Result *thread_results;
thread_index_t num_threads = static_cast<thread_index_t>(dmax);
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
}
Result *thread_results = new Result[num_threads];
thread_index_t iam = omp_get_thread_num();
#pragma omp parallel num_threads(num_threads)
{
// Neutral element.
Result reduct = Result();
thread_index_t p = num_threads;
thread_index_t iam = omp_get_thread_num();
difference_type start = iam * length / p;
difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p;
difference_type
start = equally_split_point(length, num_threads, iam),
stop = equally_split_point(length, num_threads, iam + 1);
if (start < limit)
if (start < stop)
{
reduct = f(o, begin + start);
start++;
++start;
}
for (; start < limit; start++)
for (; start < stop; ++start)
reduct = r(reduct, f(o, begin + start));
thread_results[iam] = reduct;
}
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
output = r(output, thread_results[i]);
......
include/parallel/partial_sum.h
@@ -48,7 +48,7 @@ namespace __gnu_parallel
{
// Problem: there is no 0-element given.
/** @brief Base case prefix sum routine.
/** @brief Base case prefix sum routine.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
......@@ -56,9 +56,13 @@ namespace __gnu_parallel
* @param value Start value. Must be passed since the neutral
* element is unknown in general.
* @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
inline OutputIterator
parallel_partial_sum_basecase(InputIterator begin, InputIterator end,
parallel_partial_sum_basecase(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::value_type value)
{
......@@ -75,7 +79,7 @@ namespace __gnu_parallel
return result;
}
/** @brief Parallel partial sum implementation, two-phase approach,
/** @brief Parallel partial sum implementation, two-phase approach,
no recursion.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
......@@ -85,31 +89,49 @@ namespace __gnu_parallel
* @param num_threads Number of threads to use.
* @return End iterator of output sequence.
*/
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator
parallel_partial_sum_linear(InputIterator begin, InputIterator end,
parallel_partial_sum_linear(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::difference_type n, int num_threads)
typename std::iterator_traits<InputIterator>::difference_type n)
{
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
if (num_threads > (n - 1))
num_threads = static_cast<thread_index_t>(n - 1);
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), n - 1);
if (num_threads < 2)
{
*result = *begin;
return parallel_partial_sum_basecase(begin + 1, end, result + 1, bin_op, *begin);
return parallel_partial_sum_basecase(
begin + 1, end, result + 1, bin_op, *begin);
}
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 2)));
difference_type* borders;
value_type* sums;
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
if (Settings::partial_sum_dilatation == 1.0f)
equally_split(n, num_threads + 1, borders);
else
{
difference_type chunk_length = (int)((double)n / ((double)num_threads + Settings::partial_sum_dilatation)), borderstart = n - num_threads * chunk_length;
difference_type chunk_length =
((double)n /
((double)num_threads + Settings::partial_sum_dilatation)),
borderstart = n - num_threads * chunk_length;
borders[0] = 0;
for (int i = 1; i < (num_threads + 1); i++)
{
......@@ -119,13 +141,13 @@ namespace __gnu_parallel
borders[num_threads + 1] = n;
}
value_type* sums = static_cast<value_type*>(::operator new(sizeof(value_type) * num_threads));
sums = static_cast<value_type*>(
::operator new(sizeof(value_type) * num_threads));
OutputIterator target_end;
} //single
#pragma omp parallel num_threads(num_threads)
{
int id = omp_get_thread_num();
if (id == 0)
int iam = omp_get_thread_num();
if (iam == 0)
{
*result = *begin;
parallel_partial_sum_basecase(begin + 1, begin + borders[1],
......@@ -134,44 +156,48 @@ namespace __gnu_parallel
}
else
{
sums[id] = std::accumulate(begin + borders[id] + 1,
begin + borders[id + 1],
*(begin + borders[id]),
sums[iam] = std::accumulate(begin + borders[iam] + 1,
begin + borders[iam + 1],
*(begin + borders[iam]),
bin_op, __gnu_parallel::sequential_tag());
}
#pragma omp barrier
# pragma omp barrier
#pragma omp single
parallel_partial_sum_basecase(sums + 1, sums + num_threads, sums + 1,
bin_op, sums[0]);
# pragma omp single
parallel_partial_sum_basecase(
sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]);
#pragma omp barrier
# pragma omp barrier
// Still same team.
parallel_partial_sum_basecase(begin + borders[id + 1],
begin + borders[id + 2],
result + borders[id + 1], bin_op,
sums[id]);
}
parallel_partial_sum_basecase(begin + borders[iam + 1],
begin + borders[iam + 2],
result + borders[iam + 1], bin_op,
sums[iam]);
} //parallel
delete [] sums;
delete[] sums;
delete[] borders;
return result + n;
}
/** @brief Parallel partial sum front-end.
/** @brief Parallel partial sum front-end.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator
parallel_partial_sum(InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op)
{
_GLIBCXX_CALL(begin - end);
_GLIBCXX_CALL(begin - end)
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -179,14 +205,11 @@ namespace __gnu_parallel
difference_type n = end - begin;
int num_threads = get_max_threads();
switch (Settings::partial_sum_algorithm)
{
case Settings::LINEAR:
// Need an initial offset.
return parallel_partial_sum_linear(begin, end, result, bin_op,
n, num_threads);
return parallel_partial_sum_linear(begin, end, result, bin_op, n);
default:
// Partial_sum algorithm not implemented.
_GLIBCXX_PARALLEL_ASSERT(0);
......
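parallel_partial_sum_linear above follows a two-phase scheme: each thread first accumulates a plain sum of its piece, an exclusive prefix over those per-piece sums yields each piece's starting offset, and a second sweep then writes the real prefix sums. The sketch below is a simplified standalone version; unlike the library it splits into num_threads equal pieces (no dilatation factor, no special role for thread 0), which is an assumption made for brevity.

// Reduced two-phase prefix-sum sketch in the spirit of
// parallel_partial_sum_linear above; illustrative only.
#include <omp.h>
#include <vector>
#include <cstdio>

void prefix_sum_two_phase(const std::vector<long>& in, std::vector<long>& out)
{
  long n = static_cast<long>(in.size());
  int num_threads = 1;
  std::vector<long> sums;                 // one block total per thread

#pragma omp parallel
  {
#pragma omp single
    {
      num_threads = omp_get_num_threads();
      sums.assign(num_threads, 0);
    }                                     // implicit barrier

    int iam = omp_get_thread_num();
    long start = n * iam / num_threads;
    long stop  = n * (iam + 1) / num_threads;

    // Phase 1: each thread sums its own block.
    long local = 0;
    for (long i = start; i < stop; ++i)
      local += in[i];
    sums[iam] = local;

#pragma omp barrier
#pragma omp single
    {
      // Exclusive prefix over the block totals gives each block's offset.
      long running = 0;
      for (int t = 0; t < num_threads; ++t)
        {
          long s = sums[t];
          sums[t] = running;
          running += s;
        }
    }                                     // implicit barrier

    // Phase 2: rescan the block, starting from the correct offset.
    long acc = sums[iam];
    for (long i = start; i < stop; ++i)
      {
        acc += in[i];
        out[i] = acc;
      }
  }
}

int main()
{
  std::vector<long> in(8, 1), out(8);
  prefix_sum_two_phase(in, out);
  for (long i = 0; i < 8; ++i)
    std::printf("%ld ", out[i]);          // prints 1 2 3 4 5 6 7 8
  std::printf("\n");
  return 0;
}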
include/parallel/partition.h
@@ -45,21 +45,21 @@
#include <bits/stl_algo.h>
#include <parallel/parallel.h>
/** @brief Decide whether to declare certain variable volatile in this file. */
/** @brief Decide whether to declare certain variables volatile. */
#define _GLIBCXX_VOLATILE volatile
namespace __gnu_parallel
{
/** @brief Parallel implementation of std::partition.
/** @brief Parallel implementation of std::partition.
* @param begin Begin iterator of input sequence to split.
* @param end End iterator of input sequence to split.
* @param pred Partition predicate, possibly including some kind of pivot.
* @param max_num_threads Maximum number of threads to use for this task.
* @param num_threads Maximum number of threads to use for this task.
* @return Number of elements not fulfilling the predicate. */
template<typename RandomAccessIterator, typename Predicate>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
template<typename RandomAccessIterator, typename Predicate>
typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_partition(RandomAccessIterator begin, RandomAccessIterator end,
Predicate pred, thread_index_t max_num_threads)
Predicate pred, thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -74,25 +74,37 @@ namespace __gnu_parallel
_GLIBCXX_VOLATILE difference_type leftover_left, leftover_right;
_GLIBCXX_VOLATILE difference_type leftnew, rightnew;
bool* reserved_left, * reserved_right;
reserved_left = new bool[max_num_threads];
reserved_right = new bool[max_num_threads];
bool* reserved_left = NULL, * reserved_right = NULL;
difference_type chunk_size;
if (Settings::partition_chunk_share > 0.0)
chunk_size = std::max((difference_type)Settings::partition_chunk_size, (difference_type)((double)n * Settings::partition_chunk_share / (double)max_num_threads));
else
chunk_size = Settings::partition_chunk_size;
omp_lock_t result_lock;
omp_init_lock(&result_lock);
// At least good for two processors.
while (right - left + 1 >= 2 * max_num_threads * chunk_size)
//at least two chunks per thread
if(right - left + 1 >= 2 * num_threads * chunk_size)
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
reserved_left = new bool[num_threads];
reserved_right = new bool[num_threads];
if (Settings::partition_chunk_share > 0.0)
chunk_size = std::max<difference_type>(
Settings::partition_chunk_size,
(double)n * Settings::partition_chunk_share /
(double)num_threads);
else
chunk_size = Settings::partition_chunk_size;
}
while (right - left + 1 >= 2 * num_threads * chunk_size)
{
# pragma omp single
{
difference_type num_chunks = (right - left + 1) / chunk_size;
thread_index_t num_threads = (int)std::min((difference_type)max_num_threads, num_chunks / 2);
for (int r = 0; r < num_threads; r++)
{
......@@ -101,11 +113,11 @@ namespace __gnu_parallel
}
leftover_left = 0;
leftover_right = 0;
} //implicit barrier
#pragma omp parallel num_threads(num_threads)
{
// Private.
difference_type thread_left, thread_left_border, thread_right, thread_right_border;
difference_type thread_left, thread_left_border,
thread_right, thread_right_border;
thread_left = left + 1;
// Just to satisfy the condition below.
......@@ -150,12 +162,15 @@ namespace __gnu_parallel
// Swap as usual.
while (thread_left < thread_right)
{
while (pred(begin[thread_left]) && thread_left <= thread_left_border)
while (pred(begin[thread_left])
&& thread_left <= thread_left_border)
thread_left++;
while (!pred(begin[thread_right]) && thread_right >= thread_right_border)
while (!pred(begin[thread_right])
&& thread_right >= thread_right_border)
thread_right--;
if (thread_left > thread_left_border || thread_right < thread_right_border)
if (thread_left > thread_left_border
|| thread_right < thread_right_border)
// Fetch new chunk(s).
break;
......@@ -167,28 +182,29 @@ namespace __gnu_parallel
// Now swap the leftover chunks to the right places.
if (thread_left <= thread_left_border)
#pragma omp atomic
# pragma omp atomic
leftover_left++;
if (thread_right >= thread_right_border)
#pragma omp atomic
# pragma omp atomic
leftover_right++;
#pragma omp barrier
# pragma omp barrier
#pragma omp single
# pragma omp single
{
leftnew = left - leftover_left * chunk_size;
rightnew = right + leftover_right * chunk_size;
}
#pragma omp barrier
# pragma omp barrier
// <=> thread_left_border + (chunk_size - 1) >= leftnew
if (thread_left <= thread_left_border
&& thread_left_border >= leftnew)
{
// Chunk already in place, reserve spot.
reserved_left[(left - (thread_left_border + 1)) / chunk_size] = true;
reserved_left[(left - (thread_left_border + 1)) / chunk_size]
= true;
}
// <=> thread_right_border - (chunk_size - 1) <= rightnew
......@@ -196,12 +212,15 @@ namespace __gnu_parallel
&& thread_right_border <= rightnew)
{
// Chunk already in place, reserve spot.
reserved_right[((thread_right_border - 1) - right) / chunk_size] = true;
reserved_right
[((thread_right_border - 1) - right) / chunk_size]
= true;
}
#pragma omp barrier
# pragma omp barrier
if (thread_left <= thread_left_border && thread_left_border < leftnew)
if (thread_left <= thread_left_border
&& thread_left_border < leftnew)
{
// Find spot and swap.
difference_type swapstart = -1;
......@@ -219,7 +238,10 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif
std::swap_ranges(begin + thread_left_border - (chunk_size - 1), begin + thread_left_border + 1, begin + swapstart);
std::swap_ranges(
begin + thread_left_border - (chunk_size - 1),
begin + thread_left_border + 1,
begin + swapstart);
}
if (thread_right >= thread_right_border
......@@ -241,12 +263,14 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif
std::swap_ranges(begin + thread_right_border, begin + thread_right_border + chunk_size, begin + swapstart);
std::swap_ranges(begin + thread_right_border,
begin + thread_right_border + chunk_size,
begin + swapstart);
}
#if _GLIBCXX_ASSERTIONS
#pragma omp barrier
# pragma omp barrier
#pragma omp single
# pragma omp single
{
for (int r = 0; r < leftover_left; r++)
_GLIBCXX_PARALLEL_ASSERT(reserved_left[r]);
......@@ -254,14 +278,16 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(reserved_right[r]);
}
#pragma omp barrier
# pragma omp barrier
#endif
#pragma omp barrier
# pragma omp barrier
left = leftnew;
right = rightnew;
}
} // end "recursion"
# pragma omp flush(left, right)
} // end "recursion" //parallel
difference_type final_left = left, final_right = right;
......@@ -298,14 +324,14 @@ namespace __gnu_parallel
return final_left + 1;
}
/**
/**
* @brief Parallel implementation of std::nth_element().
* @param begin Begin iterator of input sequence.
* @param nth Iterator of element that must be in position afterwards.
* @param end End iterator of input sequence.
* @param comp Comparator.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
void
parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth,
RandomAccessIterator end, Comparator comp)
......@@ -377,12 +403,12 @@ namespace __gnu_parallel
__gnu_sequential::sort(begin, end, comp);
}
/** @brief Parallel implementation of std::partial_sort().
* @param begin Begin iterator of input sequence.
* @param middle Sort until this position.
* @param end End iterator of input sequence.
* @param comp Comparator. */
template<typename RandomAccessIterator, typename Comparator>
/** @brief Parallel implementation of std::partial_sort().
* @param begin Begin iterator of input sequence.
* @param middle Sort until this position.
* @param end End iterator of input sequence.
* @param comp Comparator. */
template<typename RandomAccessIterator, typename Comparator>
void
parallel_partial_sort(RandomAccessIterator begin, RandomAccessIterator middle, RandomAccessIterator end, Comparator comp)
{
......
include/parallel/quicksort.h
@@ -53,11 +53,17 @@ namespace __gnu_parallel
* this part.
*/
template<typename RandomAccessIterator, typename Comparator>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end,
inline
typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads)
typename std::iterator_traits<RandomAccessIterator>::difference_type
pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type
num_samples,
thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -65,20 +71,24 @@ namespace __gnu_parallel
difference_type n = end - begin;
num_samples = std::min(num_samples, n);
value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples));
// Allocate uninitialized, to avoid default constructor.
value_type* samples = static_cast<value_type*>(
operator new(num_samples * sizeof(value_type)));
for (difference_type s = 0; s < num_samples; s++)
{
const unsigned long long index = static_cast<unsigned long long>(s)
* n / num_samples;
samples[s] = begin[index];
new(samples + s) value_type(begin[index]);
}
__gnu_sequential::sort(samples, samples + num_samples, comp);
value_type& pivot = samples[pivot_rank * num_samples / n];
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot);
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
pred(comp, pivot);
difference_type split = parallel_partition(begin, end, pred, num_threads);
return split;
......@@ -93,7 +103,10 @@ namespace __gnu_parallel
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads)
parallel_sort_qs_conquer(RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -110,24 +123,27 @@ namespace __gnu_parallel
if (n <= 1)
return;
thread_index_t num_processors_left;
thread_index_t num_threads_left;
if ((num_threads % 2) == 1)
num_processors_left = num_threads / 2 + 1;
num_threads_left = num_threads / 2 + 1;
else
num_processors_left = num_threads / 2;
num_threads_left = num_threads / 2;
pivot_rank = n * num_processors_left / num_threads;
pivot_rank = n * num_threads_left / num_threads;
difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
difference_type split = parallel_sort_qs_divide(
begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
#pragma omp parallel sections
{
#pragma omp section
parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left);
parallel_sort_qs_conquer(begin, begin + split,
comp, num_threads_left);
#pragma omp section
parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left);
parallel_sort_qs_conquer(begin + split, end,
comp, num_threads - num_threads_left);
}
}
......@@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads);
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end,
parallel_sort_qs(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
int num_threads)
{
_GLIBCXX_CALL(n)
......@@ -165,10 +184,7 @@ Settings::sort_qs_num_samples_preset, num_threads);
// Hard to avoid.
omp_set_num_threads(num_threads);
bool old_nested = (omp_get_nested() != 0);
omp_set_nested(true);
parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
omp_set_nested(old_nested);
}
} //namespace __gnu_parallel
......
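parallel_sort_qs_conquer above splits the thread group roughly in half and picks pivot_rank = n * num_threads_left / num_threads, so each recursive call receives data in proportion to the threads it keeps. A small illustration of that division (arithmetic only, no sorting):

// Sketch of how parallel_sort_qs_conquer divides threads and data; the
// pivot rank gives each recursive call a share of the input proportional
// to its share of the threads.  Purely illustrative.
#include <cstdio>

void show_split(long n, int num_threads, int depth)
{
  if (num_threads <= 1 || n <= 1)
    return;
  int num_threads_left = (num_threads % 2 == 1) ? num_threads / 2 + 1
                                                : num_threads / 2;
  long pivot_rank = n * num_threads_left / num_threads;
  std::printf("%*sn=%ld, threads=%d -> left: %ld elems / %d threads, "
              "right: %ld elems / %d threads\n",
              2 * depth, "", n, num_threads,
              pivot_rank, num_threads_left,
              n - pivot_rank, num_threads - num_threads_left);
  show_split(pivot_rank, num_threads_left, depth + 1);
  show_split(n - pivot_rank, num_threads - num_threads_left, depth + 1);
}

int main()
{
  show_split(1000, 5, 0);   // 5 threads: 600 elems / 3 threads on the left
  return 0;
}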
include/parallel/search.h
@@ -53,7 +53,7 @@ namespace __gnu_parallel
* @param length Length of sequence to search for.
* @param advances Returned offsets.
*/
template<typename RandomAccessIterator, typename _DifferenceTp>
template<typename RandomAccessIterator, typename _DifferenceTp>
void
calc_borders(RandomAccessIterator elements, _DifferenceTp length,
_DifferenceTp* off)
......@@ -81,7 +81,10 @@ namespace __gnu_parallel
* @param end2 End iterator of second sequence.
* @param pred Find predicate.
* @return Place of finding in first sequences. */
template<typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename Pred>
template<
typename _RandomAccessIterator1,
typename _RandomAccessIterator2,
typename Pred>
_RandomAccessIterator1
search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1,
_RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
......@@ -103,27 +106,34 @@ namespace __gnu_parallel
// Where is first occurrence of pattern? defaults to end.
difference_type result = (end1 - begin1);
difference_type *splitters;
// Pattern too long.
if (input_length < 0)
return end1;
thread_index_t num_threads = std::max<difference_type>(1, std::min<difference_type>(input_length, __gnu_parallel::get_max_threads()));
omp_lock_t result_lock;
omp_init_lock(&result_lock);
difference_type borders[num_threads + 1];
__gnu_parallel::equally_split(input_length, num_threads, borders);
thread_index_t num_threads =
std::max<difference_type>(1,
std::min<difference_type>(input_length, get_max_threads()));
difference_type advances[pattern_length];
calc_borders(begin2, pattern_length, advances);
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
splitters = new difference_type[num_threads + 1];
equally_split(input_length, num_threads, splitters);
}
thread_index_t iam = omp_get_thread_num();
difference_type start = borders[iam], stop = borders[iam + 1];
difference_type start = splitters[iam], stop = splitters[iam + 1];
difference_type pos_in_pattern = 0;
bool found_pattern = false;
......@@ -131,11 +141,12 @@ namespace __gnu_parallel
while (start <= stop && !found_pattern)
{
// Get new value of result.
#pragma omp flush(result)
#pragma omp flush(result)
// No chance for this thread to find first occurrence.
if (result < start)
break;
while (pred(begin1[start + pos_in_pattern], begin2[pos_in_pattern]))
while (pred(begin1[start + pos_in_pattern],
begin2[pos_in_pattern]))
{
++pos_in_pattern;
if (pos_in_pattern == pattern_length)
......@@ -151,12 +162,15 @@ namespace __gnu_parallel
}
// Make safe jump.
start += (pos_in_pattern - advances[pos_in_pattern]);
pos_in_pattern = (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
pos_in_pattern =
(advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
} //parallel
omp_destroy_lock(&result_lock);
delete[] splitters;
// Return iterator on found element.
return (begin1 + result);
}
......
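calc_borders fills the advances array with, in effect, the Knuth-Morris-Pratt border table, which is what allows the "safe jump" start += pos_in_pattern - advances[pos_in_pattern] without rescanning already-matched characters. A sequential sketch of the same idea follows; this is generic KMP, not the libstdc++ calc_borders itself.

// Border (failure) table plus the safe-jump matching loop, sequentially.
#include <vector>
#include <string>
#include <cstdio>

std::vector<long> borders(const std::string& pattern)
{
  long m = static_cast<long>(pattern.size());
  std::vector<long> off(m + 1);
  off[0] = -1;
  long k = -1;
  for (long i = 1; i <= m; ++i)
    {
      // Length of the longest proper border of pattern[0, i).
      while (k >= 0 && pattern[k] != pattern[i - 1])
        k = off[k];
      off[i] = ++k;
    }
  return off;
}

long kmp_search(const std::string& text, const std::string& pattern)
{
  std::vector<long> off = borders(pattern);
  long m = static_cast<long>(pattern.size());
  long start = 0, pos = 0;
  while (start + m <= static_cast<long>(text.size()))
    {
      while (pos < m && text[start + pos] == pattern[pos])
        ++pos;
      if (pos == m)
        return start;                     // match found
      // Safe jump: shift the pattern past the mismatch without rescanning.
      start += pos - off[pos];
      pos = off[pos] < 0 ? 0 : off[pos];
    }
  return -1;
}

int main()
{
  std::printf("%ld\n", kmp_search("abcabcabd", "abcabd"));  // prints 3
  return 0;
}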
include/parallel/unique_copy.h
@@ -44,13 +44,16 @@
namespace __gnu_parallel
{
/** @brief Parallel std::unique_copy(), without explicit equality predicate.
/** @brief Parallel std::unique_copy(), w/o explicit equality predicate.
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @param binary_pred Equality predicate.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator, class BinaryPredicate>
template<
typename InputIterator,
class OutputIterator,
class BinaryPredicate>
inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result, BinaryPredicate binary_pred)
......@@ -62,20 +65,27 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
difference_type size = last - first;
int num_threads = __gnu_parallel::get_max_threads();
difference_type counter[num_threads + 1];
if (size == 0)
return result;
// Let the first thread process two parts.
difference_type borders[num_threads + 2];
__gnu_parallel::equally_split(size, num_threads + 1, borders);
difference_type *counter;
difference_type *borders;
thread_index_t num_threads = get_max_threads();
// First part contains at least one element.
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
int iam = omp_get_thread_num();
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
equally_split(size, num_threads + 1, borders);
counter = new difference_type[num_threads + 1];
}
thread_index_t iam = omp_get_thread_num();
difference_type begin, end;
......@@ -83,6 +93,7 @@ namespace __gnu_parallel
// Needed for position in output
difference_type i = 0;
OutputIterator out = result;
if (iam == 0)
{
begin = borders[0] + 1; // == 1
......@@ -120,7 +131,7 @@ namespace __gnu_parallel
// Last part still untouched.
difference_type begin_output;
#pragma omp barrier
# pragma omp barrier
// Store result in output on calculated positions.
begin_output = 0;
......@@ -170,15 +181,17 @@ namespace __gnu_parallel
for (int t = 0; t < num_threads + 1; t++)
end_output += counter[t];
delete[] borders;
return result + end_output;
}
/** @brief Parallel std::unique_copy(), without explicit equality predicate
/** @brief Parallel std::unique_copy(), without explicit equality predicate
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator>
template<typename InputIterator, class OutputIterator>
inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result)
......
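parallel_unique_copy above works in two passes separated by a barrier: every thread counts the elements of its piece that differ from their predecessor, the counters are turned into output offsets, and a second pass copies each piece to its offset. A simplified standalone sketch follows; the library additionally gives the first thread two pieces and reuses the splitters from equally_split, which this sketch omits.

// Count / prefix / copy scheme of a parallel unique_copy, illustrative only.
#include <omp.h>
#include <vector>
#include <cstdio>

long unique_copy_sketch(const std::vector<int>& in, std::vector<int>& out)
{
  long n = static_cast<long>(in.size());
  if (n == 0)
    return 0;

  int num_threads = 1;
  std::vector<long> counter;

#pragma omp parallel
  {
#pragma omp single
    {
      num_threads = omp_get_num_threads();
      counter.assign(num_threads + 1, 0);
    }                                     // implicit barrier

    int iam = omp_get_thread_num();
    long begin = 1 + (n - 1) * iam / num_threads;
    long end   = 1 + (n - 1) * (iam + 1) / num_threads;

    // Counting phase: elements of this block that differ from their
    // predecessor.  Element 0 is always kept and accounted for below.
    long cnt = 0;
    for (long i = begin; i < end; ++i)
      if (in[i] != in[i - 1])
        ++cnt;
    counter[iam + 1] = cnt;

#pragma omp barrier
#pragma omp single
    {
      counter[0] = 1;                     // slot for element 0
      for (int t = 1; t <= num_threads; ++t)
        counter[t] += counter[t - 1];     // prefix sums -> output offsets
      out[0] = in[0];
    }                                     // implicit barrier

    // Copy phase: write this block's kept elements at its offset.
    long pos = counter[iam];
    for (long i = begin; i < end; ++i)
      if (in[i] != in[i - 1])
        out[pos++] = in[i];
  }

  return counter[num_threads];            // length of the result
}

int main()
{
  int raw[7] = { 1, 1, 2, 2, 2, 3, 3 };
  std::vector<int> in(raw, raw + 7), out(7);
  std::printf("%ld unique elements\n", unique_copy_sketch(in, out)); // 3
  return 0;
}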
include/parallel/workstealing.h
@@ -55,8 +55,8 @@ namespace __gnu_parallel
#define _GLIBCXX_JOB_VOLATILE volatile
/** @brief One job for a certain thread. */
template<typename _DifferenceTp>
/** @brief One job for a certain thread. */
template<typename _DifferenceTp>
struct Job
{
typedef _DifferenceTp difference_type;
......@@ -78,7 +78,7 @@ namespace __gnu_parallel
_GLIBCXX_JOB_VOLATILE difference_type load;
};
/** @brief Work stealing algorithm for random access iterators.
/** @brief Work stealing algorithm for random access iterators.
*
* Uses O(1) additional memory. Synchronization at job lists is
* done with atomic operations.
......@@ -96,13 +96,20 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_workstealing(RandomAccessIterator begin,
for_each_template_random_access_workstealing(
RandomAccessIterator begin,
RandomAccessIterator end,
Op op, Fu& f, Red r,
Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
typename std::iterator_traits<RandomAccessIterator>::difference_type
bound)
{
_GLIBCXX_CALL(end - begin)
......@@ -110,34 +117,43 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
difference_type chunk_size = static_cast<difference_type>(Settings::workstealing_chunk_size);
difference_type chunk_size =
static_cast<difference_type>(Settings::workstealing_chunk_size);
// How many jobs?
difference_type length = (bound < 0) ? (end - begin) : bound;
// To avoid false sharing in a cache line.
const int stride = Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
const int stride =
Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
// Total number of threads currently working.
thread_index_t busy = 0;
thread_index_t num_threads = get_max_threads();
difference_type num_threads_min = num_threads < end - begin ? num_threads : end - begin;
Job<difference_type> *job;
omp_lock_t output_lock;
omp_init_lock(&output_lock);
// No more threads than jobs, at least one thread.
difference_type num_threads_max = num_threads_min > 1 ? num_threads_min : 1;
num_threads = static_cast<thread_index_t>(num_threads_max);
// Create job description array.
Job<difference_type> *job = new Job<difference_type>[num_threads * stride];
// Write base value to output.
output = base;
#pragma omp parallel shared(busy) num_threads(num_threads)
// No more threads than jobs, at least one thread.
thread_index_t num_threads =
__gnu_parallel::max<thread_index_t>(1,
__gnu_parallel::min<difference_type>(length, get_max_threads()));
# pragma omp parallel shared(busy) num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
// Create job description array.
job = new Job<difference_type>[num_threads * stride];
}
// Initialization phase.
// Flags for every thread if it is doing productive work.
......@@ -158,19 +174,22 @@ namespace __gnu_parallel
// Number of elements to steal in one attempt.
difference_type steal;
// Every thread has its own random number generator (modulo num_threads).
// Every thread has its own random number generator
// (modulo num_threads).
random_number rand_gen(iam, num_threads);
#pragma omp atomic
// This thread is currently working.
# pragma omp atomic
busy++;
iam_working = true;
// How many jobs per thread? last thread gets the rest.
my_job.first = static_cast<difference_type>(iam * (length / num_threads));
my_job.first =
static_cast<difference_type>(iam * (length / num_threads));
my_job.last = (iam == (num_threads - 1)) ? (length - 1) : ((iam + 1) * (length / num_threads) - 1);
my_job.last = (iam == (num_threads - 1)) ?
(length - 1) : ((iam + 1) * (length / num_threads) - 1);
my_job.load = my_job.last - my_job.first + 1;
// Init result with first value (to have a base value for reduction).
......@@ -185,26 +204,29 @@ namespace __gnu_parallel
RandomAccessIterator current;
#pragma omp barrier
# pragma omp barrier
// Actual work phase
// Work on own or stolen start
while (busy > 0)
{
// Work until no productive thread left.
#pragma omp flush(busy)
# pragma omp flush(busy)
// Thread has own work to do
while (my_job.first <= my_job.last)
{
// fetch-and-add call
// Reserve current job block (size chunk_size) in my queue.
difference_type current_job = fetch_and_add<difference_type>(&(my_job.first), chunk_size);
difference_type current_job =
fetch_and_add<difference_type>(&(my_job.first), chunk_size);
// Update load, to make the three values consistent,
// first might have been changed in the meantime
my_job.load = my_job.last - my_job.first + 1;
for (difference_type job_counter = 0; job_counter < chunk_size && current_job <= my_job.last; job_counter++)
for (difference_type job_counter = 0;
job_counter < chunk_size && current_job <= my_job.last;
job_counter++)
{
// Yes: process it!
current = begin + current_job;
......@@ -214,15 +236,14 @@ namespace __gnu_parallel
result = r(result, f(op, current));
}
#pragma omp flush(busy)
# pragma omp flush(busy)
}
// After reaching this point, a thread's job list is empty.
if (iam_working)
{
#pragma omp atomic
// This thread no longer has work.
# pragma omp atomic
busy--;
iam_working = false;
......@@ -231,16 +252,17 @@ namespace __gnu_parallel
difference_type supposed_first, supposed_last, supposed_load;
do
{
// Find random nonempty deque (not own) and do consistency check.
// Find random nonempty deque (not own), do consistency check.
yield();
#pragma omp flush(busy)
# pragma omp flush(busy)
victim = rand_gen();
supposed_first = job[victim * stride].first;
supposed_last = job[victim * stride].last;
supposed_load = job[victim * stride].load;
}
while (busy > 0
&& ((supposed_load <= 0) || ((supposed_first + supposed_load - 1) != supposed_last)));
&& ((supposed_load <= 0)
|| ((supposed_first + supposed_load - 1) != supposed_last)));
if (busy == 0)
break;
......@@ -251,40 +273,30 @@ namespace __gnu_parallel
// Number of elements to steal (at least one).
steal = (supposed_load < 2) ? 1 : supposed_load / 2;
// Protects against stealing threads
// omp_set_lock(&(job[victim * stride].lock));
// Push victim's start forward.
difference_type stolen_first = fetch_and_add<difference_type>(&(job[victim * stride].first), steal);
difference_type stolen_try = stolen_first + steal - difference_type(1);
// Protects against working thread
// omp_unset_lock(&(job[victim * stride].lock));
difference_type stolen_first =
fetch_and_add<difference_type>(
&(job[victim * stride].first), steal);
difference_type stolen_try =
stolen_first + steal - difference_type(1);
my_job.first = stolen_first;
// Avoid std::min dependencies.
my_job.last = stolen_try < supposed_last ? stolen_try : supposed_last;
my_job.last = __gnu_parallel::min(stolen_try, supposed_last);
my_job.load = my_job.last - my_job.first + 1;
//omp_unset_lock(&(my_job.lock));
#pragma omp atomic
// Has potential work again.
# pragma omp atomic
busy++;
iam_working = true;
#pragma omp flush(busy)
# pragma omp flush(busy)
}
#pragma omp flush(busy)
# pragma omp flush(busy)
} // end while busy > 0
// Add accumulated result to output.
omp_set_lock(&output_lock);
output = r(output, result);
omp_unset_lock(&output_lock);
//omp_destroy_lock(&(my_job.lock));
}
delete[] job;
......
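The inner loop of the work-stealing scheme above reserves work with fetch_and_add: a thread atomically advances the first index of a job (its own, or half of a victim's) and then owns the chunk it fetched. A reduced illustration of that reservation idiom, using the GCC __sync_fetch_and_add builtin in place of the library's fetch_and_add wrapper (and without the stealing and busy-counter machinery):

// Chunk reservation by atomic fetch-and-add, illustrative only.
#include <omp.h>
#include <cstdio>

int main()
{
  const long length = 100, chunk_size = 7;
  long first = 0;                 // shared queue head
  long processed = 0;

#pragma omp parallel reduction(+:processed)
  {
    for (;;)
      {
        // Reserve the next chunk atomically, as the work-stealing loop
        // above does with my_job.first.
        long begin = __sync_fetch_and_add(&first, chunk_size);
        if (begin >= length)
          break;
        long end = begin + chunk_size < length ? begin + chunk_size : length;
        for (long i = begin; i < end; ++i)
          ++processed;            // stand-in for f(op, begin + i)
      }
  }
  std::printf("%ld\n", processed);   // always 100
  return 0;
}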