Commit e683ee2a by Johannes Singler, committed by Johannes Singler

re PR libstdc++/33893 ([parallel mode] Algorithms rely on omp_set_dynamic(false))

2007-11-22  Johannes Singler  <singler@ira.uka.de>

        PR libstdc++/33893
        * include/parallel/multiway_merge.h: made omp_dynamic-safe
        * include/parallel/workstealing.h: made omp_dynamic-safe
        * include/parallel/base.h: infrastructure, cleanup
        * include/parallel/par_loop.h: made omp_dynamic-safe
        * include/parallel/features.h: activate loser tree variant
        * include/parallel/quicksort.h: made omp_dynamic-safe
        * include/parallel/compiletime_settings.h: settings overridable
        * include/parallel/equally_split.h: made omp_dynamic-safe
        * include/parallel/omp_loop_static.h: made omp_dynamic-safe
        * include/parallel/random_shuffle.h: made omp_dynamic-safe
        * include/parallel/balanced_quicksort.h: made omp_dynamic-safe
        * include/parallel/set_operations.h: made omp_dynamic-safe
        * include/parallel/unique_copy.h: made omp_dynamic-safe
        * include/parallel/multiway_mergesort.h: made omp_dynamic-safe
        * include/parallel/search.h: made omp_dynamic-safe
        * include/parallel/partition.h: made omp_dynamic-safe
        * include/parallel/partial_sum.h: made omp_dynamic-safe
        * include/parallel/find.h: made omp_dynamic-safe
        * include/parallel/omp_loop.h: made omp_dynamic-safe
        * include/parallel/losertree.h: avoid default constructor

From-SVN: r130347
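The common thread in this patch: with omp_set_dynamic() enabled, the OpenMP runtime may hand out fewer threads than requested, so the headers no longer size per-thread buffers from get_max_threads() up front. Instead they enter the parallel region, read omp_get_num_threads() inside a single section, and allocate the per-thread storage there. A minimal standalone sketch of that idiom (not code from the patch; the function and names are illustrative only):

#include <omp.h>
#include <cstddef>

// Hedged sketch of the "omp_dynamic-safe" idiom: request a thread count, but
// only trust omp_get_num_threads() once the team exists, and size per-thread
// storage at that point.
long parallel_sum(const int* data, std::size_t n)
{
  int num_threads = 1;
  long* partial = 0;
# pragma omp parallel num_threads(omp_get_max_threads())
  {
#   pragma omp single
    {
      // With omp_set_dynamic(true) this may be smaller than requested.
      num_threads = omp_get_num_threads();
      partial = new long[num_threads]();
    }   // implicit barrier: partial is valid for all threads below
    int iam = omp_get_thread_num();
#   pragma omp for
    for (long i = 0; i < (long)n; ++i)
      partial[iam] += data[i];
  }
  long sum = 0;
  for (int i = 0; i < num_threads; ++i)
    sum += partial[i];
  delete[] partial;
  return sum;
}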
@@ -39,7 +39,7 @@
#include <cstdio>
/** @brief Determine verbosity level of the parallel mode.
* Level 1 prints a message each time when entering a parallel-mode function. */
* Level 1 prints a message each time a parallel-mode function is entered. */
#define _GLIBCXX_VERBOSE_LEVEL 0
/** @def _GLIBCXX_CALL
@@ -50,27 +50,40 @@
#define _GLIBCXX_CALL(n)
#endif
#if (_GLIBCXX_VERBOSE_LEVEL == 1)
#define _GLIBCXX_CALL(n) printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#define _GLIBCXX_CALL(n) \
printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", \
__PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#endif
#ifndef _GLIBCXX_SCALE_DOWN_FPU
/** @brief Use floating-point scaling instead of modulo for mapping
* random numbers to a range. This can be faster on certain CPUs. */
#define _GLIBCXX_SCALE_DOWN_FPU 0
#endif
#ifndef _GLIBCXX_ASSERTIONS
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Should be switched on only locally. */
#define _GLIBCXX_ASSERTIONS 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Consider the size of the L1 cache for __gnu_parallel::parallel_random_shuffle(). */
* Consider the size of the L1 cache for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Consider the size of the TLB for __gnu_parallel::parallel_random_shuffle(). */
* Consider the size of the TLB for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
#endif
#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
/** @brief First copy the data, sort it locally, and merge it back
* (0); or copy it back after everything is done (1).
*
* Recommendation: 0 */
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
#endif
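Because each setting above is now wrapped in #ifndef, it can be overridden per translation unit or with -D on the compiler command line instead of editing compiletime_settings.h. A hedged example (not part of the patch; the values are purely illustrative):

// Override parallel-mode tuning macros before any parallel header is
// included; the #ifndef guards above then leave these values alone.
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 1
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 1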
@@ -39,30 +39,58 @@
namespace __gnu_parallel
{
/** @brief Function to split a sequence into parts of almost equal size.
*
* The resulting sequence s of length p+1 contains the splitting
* positions when splitting the range [0,n) into parts of almost
* equal size (plus minus 1). The first entry is 0, the last one
* n. There may result empty parts.
* @param n Number of elements
* @param p Number of parts
* @param s Splitters
* @returns End of splitter sequence, i. e. @c s+p+1 */
template<typename _DifferenceTp, typename OutputIterator>
/** @brief Function to split a sequence into parts of almost equal size.
*
* The resulting sequence s of length num_threads+1 contains the splitting
* positions when splitting the range [0,n) into parts of almost
* equal size (plus minus 1). The first entry is 0, the last one
* n. There may result empty parts.
* @param n Number of elements
* @param num_threads Number of parts
* @param s Splitters
* @returns End of splitter sequence, i. e. @c s+num_threads+1 */
template<typename difference_type, typename OutputIterator>
OutputIterator
equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s)
equally_split(difference_type n,
thread_index_t num_threads,
OutputIterator s)
{
typedef _DifferenceTp difference_type;
difference_type chunk_length = n / p, split = n % p, start = 0;
for (int i = 0; i < p; i++)
difference_type chunk_length = n / num_threads,
num_longer_chunks = n % num_threads,
pos = 0;
for (thread_index_t i = 0; i < num_threads; ++i)
{
*s++ = start;
start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length;
*s++ = pos;
pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
}
*s++ = n;
return s;
}
/** @brief Function to split a sequence into parts of almost equal size.
*
* Returns the position of the splitting point between
* thread number thread_no (included) and
* thread number thread_no+1 (excluded).
* @param n Number of elements
* @param num_threads Number of parts
* @returns Splitting point */
template<typename difference_type>
difference_type
equally_split_point(difference_type n,
thread_index_t num_threads,
thread_index_t thread_no)
{
difference_type chunk_length = n / num_threads,
num_longer_chunks = n % num_threads;
if(thread_no < num_longer_chunks)
return thread_no * (chunk_length + 1);
else
return num_longer_chunks * (chunk_length + 1)
+ (thread_no - num_longer_chunks) * chunk_length;
}
}
#endif
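For reference, a hedged usage sketch of the two helpers above (not part of the patch): splitting n = 10 elements into num_threads = 3 parts gives chunks of size 4, 3, 3, i.e. the splitter sequence 0, 4, 7, 10, and equally_split_point returns the same positions one at a time.

#include <parallel/equally_split.h>   // internal parallel-mode header
#include <cassert>

void equally_split_example()
{
  long s[3 + 1];
  __gnu_parallel::equally_split(10L, 3, s);
  assert(s[0] == 0 && s[1] == 4 && s[2] == 7 && s[3] == 10);
  for (int i = 0; i <= 3; ++i)
    assert(__gnu_parallel::equally_split_point(10L, 3, i) == s[i]);
}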
@@ -66,7 +66,7 @@
* @brief Include guarded (sequences may run empty) loser tree,
* moving objects.
* @see __gnu_parallel::Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE 0
#define _GLIBCXX_LOSER_TREE 1
#endif
#ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
@@ -43,54 +43,71 @@
#include <parallel/settings.h>
#include <parallel/basic_iterator.h>
#include <parallel/base.h>
namespace __gnu_parallel
{
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using an OpenMP for loop.
*
* @param begin Begin iterator of element sequence.
* @param end End iterator of element sequence.
* @param o User-supplied functor (comparator, predicate, adding
* functor, etc.).
* @param f Functor to "process" an element with op (depends on
* desired functionality, e. g. for std::for_each(), ...).
* @param r Functor to "add" a single result to the already
* processed elements (depends on functionality).
* @param base Base value for reduction.
* @param output Pointer to position where final result is written to
* @param bound Maximum number of elements processed (e. g. for
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using an OpenMP for loop.
*
* @param begin Begin iterator of element sequence.
* @param end End iterator of element sequence.
* @param o User-supplied functor (comparator, predicate, adding
* functor, etc.).
* @param f Functor to "process" an element with op (depends on
* desired functionality, e. g. for std::for_each(), ...).
* @param r Functor to "add" a single result to the already
* processed elements (depends on functionality).
* @param base Base value for reduction.
* @param output Pointer to position where final result is written to
* @param bound Maximum number of elements processed (e. g. for
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_omp_loop(RandomAccessIterator begin, RandomAccessIterator end, Op o, Fu& f, Red r, Result base, Result& output, typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_omp_loop(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type;
typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : static_cast<thread_index_t>((end - begin));
Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++)
Result *thread_results;
# pragma omp parallel num_threads(num_threads)
{
thread_results[i] = r(thread_results[i], f(o, begin+i));
}
#pragma omp parallel num_threads(num_threads)
{
#pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
{
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
}
}
# pragma omp single
{
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
for (thread_index_t i = 0; i < num_threads; i++)
thread_results[i] = Result();
}
thread_index_t iam = omp_get_thread_num();
# pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
{
output = r(output, thread_results[i]);
}
output = r(output, thread_results[i]);
delete [] thread_results;
@@ -100,6 +117,7 @@ namespace __gnu_parallel
return o;
}
} // end namespace
#endif
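A hedged usage sketch of the Op/Fu/Red protocol visible above (not from the patch; the functor names are hypothetical and the internal header is assumed to be usable directly with -fopenmp): counting the even elements of a vector.

#include <parallel/omp_loop.h>   // internal parallel-mode header
#include <vector>

struct no_op { };                     // Op: dummy state handed to Fu

struct count_even_fu                  // Fu: "process" one element
{
  template<typename It>
  int operator()(no_op&, It it) const { return (*it % 2 == 0) ? 1 : 0; }
};

struct plus_red                       // Red: combine two partial results
{
  int operator()(int a, int b) const { return a + b; }
};

int count_even(std::vector<int>& v)
{
  int result = 0;
  count_even_fu f;
  __gnu_parallel::for_each_template_random_access_omp_loop(
    v.begin(), v.end(), no_op(), f, plus_red(), 0, result, v.size());
  return result;
}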
@@ -64,39 +64,50 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_omp_loop_static(RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r,
Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_omp_loop_static(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin);
Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++)
Result *thread_results;
# pragma omp parallel num_threads(num_threads)
{
thread_results[i] = r(thread_results[i], f(o, begin+i));
}
#pragma omp parallel num_threads(num_threads)
{
#pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
{
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
}
}
# pragma omp single
{
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
for (thread_index_t i = 0; i < num_threads; i++)
thread_results[i] = Result();
}
thread_index_t iam = omp_get_thread_num();
# pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
{
output = r(output, thread_results[i]);
}
output = r(output, thread_results[i]);
delete [] thread_results;
@@ -106,6 +117,7 @@ namespace __gnu_parallel
return o;
}
} // end namespace
#endif
@@ -41,69 +41,80 @@
#include <omp.h>
#include <parallel/settings.h>
#include <parallel/base.h>
namespace __gnu_parallel
{
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using hand-crafted parallelization by equal splitting
* the work.
*
* @param begin Begin iterator of element sequence.
* @param end End iterator of element sequence.
* @param o User-supplied functor (comparator, predicate, adding
* functor, ...)
* @param f Functor to "process" an element with op (depends on
* desired functionality, e. g. for std::for_each(), ...).
* @param r Functor to "add" a single result to the already
* processed elements (depends on functionality).
* @param base Base value for reduction.
* @param output Pointer to position where final result is written to
* @param bound Maximum number of elements processed (e. g. for
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using hand-crafted parallelization by equal splitting
* the work.
*
* @param begin Begin iterator of element sequence.
* @param end End iterator of element sequence.
* @param o User-supplied functor (comparator, predicate, adding
* functor, ...)
* @param f Functor to "process" an element with op (depends on
* desired functionality, e. g. for std::for_each(), ...).
* @param r Functor to "add" a single result to the already
* processed elements (depends on functionality).
* @param base Base value for reduction.
* @param output Pointer to position where final result is written to
* @param bound Maximum number of elements processed (e. g. for
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_ed(RandomAccessIterator begin,
RandomAccessIterator end, Op o, Fu& f,
Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_ed(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
const difference_type length = end - begin;
const difference_type settings_threads = static_cast<difference_type>(get_max_threads());
const difference_type dmin = settings_threads < length ? settings_threads : length;
const difference_type dmax = dmin > 1 ? dmin : 1;
Result *thread_results;
thread_index_t num_threads = static_cast<thread_index_t>(dmax);
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
}
Result *thread_results = new Result[num_threads];
thread_index_t iam = omp_get_thread_num();
#pragma omp parallel num_threads(num_threads)
{
// Neutral element.
Result reduct = Result();
// Neutral element.
Result reduct = Result();
thread_index_t p = num_threads;
thread_index_t iam = omp_get_thread_num();
difference_type start = iam * length / p;
difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p;
difference_type
start = equally_split_point(length, num_threads, iam),
stop = equally_split_point(length, num_threads, iam + 1);
if (start < limit)
{
reduct = f(o, begin + start);
start++;
}
if (start < stop)
{
reduct = f(o, begin + start);
++start;
}
for (; start < limit; start++)
reduct = r(reduct, f(o, begin + start));
for (; start < stop; ++start)
reduct = r(reduct, f(o, begin + start));
thread_results[iam] = reduct;
}
thread_results[iam] = reduct;
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
output = r(output, thread_results[i]);
@@ -48,130 +48,156 @@ namespace __gnu_parallel
{
// Problem: there is no 0-element given.
/** @brief Base case prefix sum routine.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @param value Start value. Must be passed since the neutral
* element is unknown in general.
* @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
/** @brief Base case prefix sum routine.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @param value Start value. Must be passed since the neutral
* element is unknown in general.
* @return End iterator of output sequence. */
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
inline OutputIterator
parallel_partial_sum_basecase(InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::value_type value)
parallel_partial_sum_basecase(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::value_type value)
{
if (begin == end)
return result;
while (begin != end)
{
value = bin_op(value, *begin);
*result = value;
result++;
begin++;
value = bin_op(value, *begin);
*result = value;
result++;
begin++;
}
return result;
}
/** @brief Parallel partial sum implementation, two-phase approach,
no recursion.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @param n Length of sequence.
* @param num_threads Number of threads to use.
* @return End iterator of output sequence.
*/
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
/** @brief Parallel partial sum implementation, two-phase approach,
no recursion.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @param n Length of sequence.
* @param num_threads Number of threads to use.
* @return End iterator of output sequence.
*/
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator
parallel_partial_sum_linear(InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::difference_type n, int num_threads)
parallel_partial_sum_linear(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::difference_type n)
{
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
if (num_threads > (n - 1))
num_threads = static_cast<thread_index_t>(n - 1);
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), n - 1);
if (num_threads < 2)
{
*result = *begin;
return parallel_partial_sum_basecase(begin + 1, end, result + 1, bin_op, *begin);
*result = *begin;
return parallel_partial_sum_basecase(
begin + 1, end, result + 1, bin_op, *begin);
}
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 2)));
difference_type* borders;
value_type* sums;
if (Settings::partial_sum_dilatation == 1.0f)
equally_split(n, num_threads + 1, borders);
else
# pragma omp parallel num_threads(num_threads)
{
difference_type chunk_length = (int)((double)n / ((double)num_threads + Settings::partial_sum_dilatation)), borderstart = n - num_threads * chunk_length;
borders[0] = 0;
for (int i = 1; i < (num_threads + 1); i++)
{
borders[i] = borderstart;
borderstart += chunk_length;
}
borders[num_threads + 1] = n;
}
value_type* sums = static_cast<value_type*>(::operator new(sizeof(value_type) * num_threads));
OutputIterator target_end;
#pragma omp parallel num_threads(num_threads)
{
int id = omp_get_thread_num();
if (id == 0)
{
*result = *begin;
parallel_partial_sum_basecase(begin + 1, begin + borders[1],
result + 1, bin_op, *begin);
sums[0] = *(result + borders[1] - 1);
}
else
{
sums[id] = std::accumulate(begin + borders[id] + 1,
begin + borders[id + 1],
*(begin + borders[id]),
bin_op, __gnu_parallel::sequential_tag());
}
#pragma omp barrier
#pragma omp single
parallel_partial_sum_basecase(sums + 1, sums + num_threads, sums + 1,
bin_op, sums[0]);
#pragma omp barrier
// Still same team.
parallel_partial_sum_basecase(begin + borders[id + 1],
begin + borders[id + 2],
result + borders[id + 1], bin_op,
sums[id]);
}
delete [] sums;
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
if (Settings::partial_sum_dilatation == 1.0f)
equally_split(n, num_threads + 1, borders);
else
{
difference_type chunk_length =
((double)n /
((double)num_threads + Settings::partial_sum_dilatation)),
borderstart = n - num_threads * chunk_length;
borders[0] = 0;
for (int i = 1; i < (num_threads + 1); i++)
{
borders[i] = borderstart;
borderstart += chunk_length;
}
borders[num_threads + 1] = n;
}
sums = static_cast<value_type*>(
::operator new(sizeof(value_type) * num_threads));
OutputIterator target_end;
} //single
int iam = omp_get_thread_num();
if (iam == 0)
{
*result = *begin;
parallel_partial_sum_basecase(begin + 1, begin + borders[1],
result + 1, bin_op, *begin);
sums[0] = *(result + borders[1] - 1);
}
else
{
sums[iam] = std::accumulate(begin + borders[iam] + 1,
begin + borders[iam + 1],
*(begin + borders[iam]),
bin_op, __gnu_parallel::sequential_tag());
}
# pragma omp barrier
# pragma omp single
parallel_partial_sum_basecase(
sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]);
# pragma omp barrier
// Still same team.
parallel_partial_sum_basecase(begin + borders[iam + 1],
begin + borders[iam + 2],
result + borders[iam + 1], bin_op,
sums[iam]);
} //parallel
delete[] sums;
delete[] borders;
return result + n;
}
/** @brief Parallel partial sum front-end.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
/** @brief Parallel partial sum front-end.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @return End iterator of output sequence. */
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator
parallel_partial_sum(InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op)
OutputIterator result, BinaryOperation bin_op)
{
_GLIBCXX_CALL(begin - end);
_GLIBCXX_CALL(begin - end)
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
@@ -179,18 +205,15 @@ namespace __gnu_parallel
difference_type n = end - begin;
int num_threads = get_max_threads();
switch (Settings::partial_sum_algorithm)
{
case Settings::LINEAR:
// Need an initial offset.
return parallel_partial_sum_linear(begin, end, result, bin_op,
n, num_threads);
// Need an initial offset.
return parallel_partial_sum_linear(begin, end, result, bin_op, n);
default:
// Partial_sum algorithm not implemented.
_GLIBCXX_PARALLEL_ASSERT(0);
return result + n;
// Partial_sum algorithm not implemented.
_GLIBCXX_PARALLEL_ASSERT(0);
return result + n;
}
}
}
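For orientation, a hedged standalone sketch of the two-phase scheme that parallel_partial_sum_linear implements, written here for plain int addition rather than the library's iterator/functor form: each thread sums its chunk, an exclusive prefix sum over the per-thread totals yields each thread's starting offset, and a second pass writes the final prefix sums.

#include <omp.h>

// Hedged sketch (not library code): two-phase parallel prefix sum over ints.
void prefix_sum_two_phase(const int* in, int* out, int n)
{
  int num_threads = 1;
  long* sums = 0;
# pragma omp parallel
  {
#   pragma omp single
    {
      num_threads = omp_get_num_threads();
      sums = new long[num_threads];
    }
    int iam = omp_get_thread_num();
    int start = (int)((long long)n * iam / num_threads);
    int stop  = (int)((long long)n * (iam + 1) / num_threads);

    // Phase 1: each thread sums its own chunk.
    long local = 0;
    for (int i = start; i < stop; ++i)
      local += in[i];
    sums[iam] = local;
#   pragma omp barrier
#   pragma omp single
    {
      // Exclusive prefix sum over the per-thread totals = starting offsets.
      long acc = 0;
      for (int t = 0; t < num_threads; ++t)
        { long next = acc + sums[t]; sums[t] = acc; acc = next; }
    }   // implicit barrier

    // Phase 2: rescan the chunk starting from this thread's offset.
    long run = sums[iam];
    for (int i = start; i < stop; ++i)
      { run += in[i]; out[i] = (int)run; }
  }
  delete[] sums;
}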
@@ -53,11 +53,17 @@ namespace __gnu_parallel
* this part.
*/
template<typename RandomAccessIterator, typename Comparator>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads)
inline
typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type
pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type
num_samples,
thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
@@ -65,20 +71,24 @@
difference_type n = end - begin;
num_samples = std::min(num_samples, n);
value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples));
// Allocate uninitialized, to avoid default constructor.
value_type* samples = static_cast<value_type*>(
operator new(num_samples * sizeof(value_type)));
for (difference_type s = 0; s < num_samples; s++)
{
const unsigned long long index = static_cast<unsigned long long>(s)
* n / num_samples;
samples[s] = begin[index];
const unsigned long long index = static_cast<unsigned long long>(s)
* n / num_samples;
new(samples + s) value_type(begin[index]);
}
__gnu_sequential::sort(samples, samples + num_samples, comp);
value_type& pivot = samples[pivot_rank * num_samples / n];
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot);
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
pred(comp, pivot);
difference_type split = parallel_partition(begin, end, pred, num_threads);
return split;
@@ -93,7 +103,10 @@
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads)
parallel_sort_qs_conquer(RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
@@ -101,8 +114,8 @@
if (num_threads <= 1)
{
__gnu_sequential::sort(begin, end, comp);
return;
__gnu_sequential::sort(begin, end, comp);
return;
}
difference_type n = end - begin, pivot_rank;
@@ -110,24 +123,27 @@
if (n <= 1)
return;
thread_index_t num_processors_left;
thread_index_t num_threads_left;
if ((num_threads % 2) == 1)
num_processors_left = num_threads / 2 + 1;
num_threads_left = num_threads / 2 + 1;
else
num_processors_left = num_threads / 2;
num_threads_left = num_threads / 2;
pivot_rank = n * num_processors_left / num_threads;
pivot_rank = n * num_threads_left / num_threads;
difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
difference_type split = parallel_sort_qs_divide(
begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
#pragma omp parallel sections
{
#pragma omp section
parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left);
parallel_sort_qs_conquer(begin, begin + split,
comp, num_threads_left);
#pragma omp section
parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left);
parallel_sort_qs_conquer(begin + split, end,
comp, num_threads - num_threads_left);
}
}
@@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads);
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
parallel_sort_qs(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
int num_threads)
{
_GLIBCXX_CALL(n)
@@ -165,12 +184,9 @@ Settings::sort_qs_num_samples_preset, num_threads);
// Hard to avoid.
omp_set_num_threads(num_threads);
bool old_nested = (omp_get_nested() != 0);
omp_set_nested(true);
parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
omp_set_nested(old_nested);
}
} //namespace __gnu_parallel
} //namespace __gnu_parallel
#endif
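The sample buffer in parallel_sort_qs_divide above now takes raw storage from operator new and constructs elements with placement new, so value_type need not be default constructible (the same motivation as the losertree.h change). A hedged standalone sketch of that pattern, with an illustrative type:

#include <new>

// Illustrative type without a default constructor.
struct sample { int key; explicit sample(int k) : key(k) { } };

// Hedged sketch (not library code): raw allocation plus placement new,
// mirroring the samples buffer above.
void copy_samples(const int* src, int n)
{
  sample* buf = static_cast<sample*>(::operator new(n * sizeof(sample)));
  for (int i = 0; i < n; ++i)
    new (buf + i) sample(src[i]);   // construct each element in place
  // ... use buf[0 .. n) ...
  for (int i = 0; i < n; ++i)
    buf[i].~sample();               // destroy explicitly
  ::operator delete(buf);
}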
@@ -53,10 +53,10 @@ namespace __gnu_parallel
* @param length Length of sequence to search for.
* @param advances Returned offsets.
*/
template<typename RandomAccessIterator, typename _DifferenceTp>
template<typename RandomAccessIterator, typename _DifferenceTp>
void
calc_borders(RandomAccessIterator elements, _DifferenceTp length,
_DifferenceTp* off)
_DifferenceTp* off)
{
typedef _DifferenceTp difference_type;
@@ -66,9 +66,9 @@ namespace __gnu_parallel
difference_type k = 0;
for (difference_type j = 2; j <= length; j++)
{
while ((k >= 0) && !(elements[k] == elements[j-1]))
k = off[k];
off[j] = ++k;
while ((k >= 0) && !(elements[k] == elements[j-1]))
k = off[k];
off[j] = ++k;
}
}
@@ -81,11 +81,14 @@ namespace __gnu_parallel
* @param end2 End iterator of second sequence.
* @param pred Find predicate.
* @return Place of finding in first sequences. */
template<typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename Pred>
template<
typename _RandomAccessIterator1,
typename _RandomAccessIterator2,
typename Pred>
_RandomAccessIterator1
search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1,
_RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
Pred pred)
_RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
Pred pred)
{
typedef std::iterator_traits<_RandomAccessIterator1> traits_type;
typedef typename traits_type::difference_type difference_type;
@@ -103,60 +106,71 @@ namespace __gnu_parallel
// Where is first occurrence of pattern? defaults to end.
difference_type result = (end1 - begin1);
difference_type *splitters;
// Pattern too long.
if (input_length < 0)
return end1;
thread_index_t num_threads = std::max<difference_type>(1, std::min<difference_type>(input_length, __gnu_parallel::get_max_threads()));
omp_lock_t result_lock;
omp_init_lock(&result_lock);
difference_type borders[num_threads + 1];
__gnu_parallel::equally_split(input_length, num_threads, borders);
thread_index_t num_threads =
std::max<difference_type>(1,
std::min<difference_type>(input_length, get_max_threads()));
difference_type advances[pattern_length];
calc_borders(begin2, pattern_length, advances);
#pragma omp parallel num_threads(num_threads)
{
thread_index_t iam = omp_get_thread_num();
difference_type start = borders[iam], stop = borders[iam + 1];
difference_type pos_in_pattern = 0;
bool found_pattern = false;
while (start <= stop && !found_pattern)
{
// Get new value of result.
#pragma omp flush(result)
// No chance for this thread to find first occurrence.
if (result < start)
break;
while (pred(begin1[start + pos_in_pattern], begin2[pos_in_pattern]))
{
++pos_in_pattern;
if (pos_in_pattern == pattern_length)
{
// Found new candidate for result.
omp_set_lock(&result_lock);
result = std::min(result, start);
omp_unset_lock(&result_lock);
found_pattern = true;
break;
}
}
// Make safe jump.
start += (pos_in_pattern - advances[pos_in_pattern]);
pos_in_pattern = (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
}
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
splitters = new difference_type[num_threads + 1];
equally_split(input_length, num_threads, splitters);
}
thread_index_t iam = omp_get_thread_num();
difference_type start = splitters[iam], stop = splitters[iam + 1];
difference_type pos_in_pattern = 0;
bool found_pattern = false;
while (start <= stop && !found_pattern)
{
// Get new value of result.
#pragma omp flush(result)
// No chance for this thread to find first occurrence.
if (result < start)
break;
while (pred(begin1[start + pos_in_pattern],
begin2[pos_in_pattern]))
{
++pos_in_pattern;
if (pos_in_pattern == pattern_length)
{
// Found new candidate for result.
omp_set_lock(&result_lock);
result = std::min(result, start);
omp_unset_lock(&result_lock);
found_pattern = true;
break;
}
}
// Make safe jump.
start += (pos_in_pattern - advances[pos_in_pattern]);
pos_in_pattern =
(advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
} //parallel
omp_destroy_lock(&result_lock);
delete[] splitters;
// Return iterator on found element.
return (begin1 + result);
}
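A hedged usage sketch of search_template above (not from the patch; it assumes the internal <parallel/search.h> header is usable directly and the code is built with -fopenmp):

#include <parallel/search.h>   // internal parallel-mode header
#include <functional>
#include <vector>

std::vector<int>::iterator
find_first_occurrence(std::vector<int>& haystack, std::vector<int>& needle)
{
  return __gnu_parallel::search_template(haystack.begin(), haystack.end(),
                                         needle.begin(), needle.end(),
                                         std::equal_to<int>());
}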
@@ -44,16 +44,19 @@
namespace __gnu_parallel
{
/** @brief Parallel std::unique_copy(), without explicit equality predicate.
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @param binary_pred Equality predicate.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator, class BinaryPredicate>
/** @brief Parallel std::unique_copy(), w/o explicit equality predicate.
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @param binary_pred Equality predicate.
* @return End iterator of result sequence. */
template<
typename InputIterator,
class OutputIterator,
class BinaryPredicate>
inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result, BinaryPredicate binary_pred)
OutputIterator result, BinaryPredicate binary_pred)
{
_GLIBCXX_CALL(last - first)
@@ -62,126 +65,136 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
difference_type size = last - first;
int num_threads = __gnu_parallel::get_max_threads();
difference_type counter[num_threads + 1];
if (size == 0)
return result;
// Let the first thread process two parts.
difference_type borders[num_threads + 2];
__gnu_parallel::equally_split(size, num_threads + 1, borders);
difference_type *counter;
difference_type *borders;
thread_index_t num_threads = get_max_threads();
// First part contains at least one element.
#pragma omp parallel num_threads(num_threads)
{
int iam = omp_get_thread_num();
difference_type begin, end;
// Check for length without duplicates
// Needed for position in output
difference_type i = 0;
OutputIterator out = result;
if (iam == 0)
{
begin = borders[0] + 1; // == 1
end = borders[iam + 1];
i++;
new (static_cast<void *>(&*out)) value_type(*first);
out++;
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
i++;
new (static_cast<void *>(&*out)) value_type(*iter);
out++;
}
}
}
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
equally_split(size, num_threads + 1, borders);
counter = new difference_type[num_threads + 1];
}
thread_index_t iam = omp_get_thread_num();
difference_type begin, end;
// Check for length without duplicates
// Needed for position in output
difference_type i = 0;
OutputIterator out = result;
if (iam == 0)
{
begin = borders[0] + 1; // == 1
end = borders[iam + 1];
i++;
new (static_cast<void *>(&*out)) value_type(*first);
out++;
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
i++;
new (static_cast<void *>(&*out)) value_type(*iter);
out++;
}
}
}
else
{
begin = borders[iam]; //one part
end = borders[iam + 1];
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
i++;
}
}
}
{
begin = borders[iam]; //one part
end = borders[iam + 1];
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
i++;
}
}
}
counter[iam] = i;
// Last part still untouched.
difference_type begin_output;
#pragma omp barrier
# pragma omp barrier
// Store result in output on calculated positions.
begin_output = 0;
if (iam == 0)
{
for (int t = 0; t < num_threads; t++)
begin_output += counter[t];
{
for (int t = 0; t < num_threads; t++)
begin_output += counter[t];
i = 0;
i = 0;
OutputIterator iter_out = result + begin_output;
OutputIterator iter_out = result + begin_output;
begin = borders[num_threads];
end = size;
begin = borders[num_threads];
end = size;
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (iter == first || !binary_pred(*iter, *(iter-1)))
{
i++;
new (static_cast<void *>(&*iter_out)) value_type(*iter);
iter_out++;
}
}
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (iter == first || !binary_pred(*iter, *(iter-1)))
{
i++;
new (static_cast<void *>(&*iter_out)) value_type(*iter);
iter_out++;
}
}
counter[num_threads] = i;
}
counter[num_threads] = i;
}
else
{
for (int t = 0; t < iam; t++)
begin_output += counter[t];
OutputIterator iter_out = result + begin_output;
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
new (static_cast<void *> (&*iter_out)) value_type(*iter);
iter_out++;
}
}
}
{
for (int t = 0; t < iam; t++)
begin_output += counter[t];
OutputIterator iter_out = result + begin_output;
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
new (static_cast<void *> (&*iter_out)) value_type(*iter);
iter_out++;
}
}
}
}
difference_type end_output = 0;
for (int t = 0; t < num_threads + 1; t++)
end_output += counter[t];
delete[] borders;
return result + end_output;
}
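A hedged usage sketch of the predicate overload above (not from the patch; same assumption that the internal header is usable directly): drop consecutive duplicates from a vector, then trim the output to the returned end iterator.

#include <parallel/unique_copy.h>   // internal parallel-mode header
#include <functional>
#include <vector>

std::vector<int> drop_consecutive_duplicates(const std::vector<int>& v)
{
  std::vector<int> out(v.size());
  std::vector<int>::iterator new_end =
    __gnu_parallel::parallel_unique_copy(v.begin(), v.end(), out.begin(),
                                         std::equal_to<int>());
  out.erase(new_end, out.end());
  return out;
}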
/** @brief Parallel std::unique_copy(), without explicit equality predicate
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator>
/** @brief Parallel std::unique_copy(), without explicit equality predicate
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator>
inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result)
OutputIterator result)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;