Commit e683ee2a, authored and committed by Johannes Singler

re PR libstdc++/33893 ([parallel mode] Algorithms rely on omp_set_dynamic(false))

2007-11-22  Johannes Singler  <singler@ira.uka.de>

        PR libstdc++/33893
        * include/parallel/multiway_merge.h: made omp_dynamic-safe
        * include/parallel/workstealing.h: made omp_dynamic-safe
        * include/parallel/base.h: infrastructure, cleanup
        * include/parallel/par_loop.h: made omp_dynamic-safe
        * include/parallel/features.h: activate loser tree variant
        * include/parallel/quicksort.h: made omp_dynamic-safe
        * include/parallel/compiletime_settings.h: settings overridable
        * include/parallel/equally_split.h: made omp_dynamic-safe
        * include/parallel/omp_loop_static.h: made omp_dynamic-safe
        * include/parallel/random_shuffle.h: made omp_dynamic-safe
        * include/parallel/balanced_quicksort.h: made omp_dynamic-safe
        * include/parallel/set_operations.h: made omp_dynamic-safe
        * include/parallel/unique_copy.h: made omp_dynamic-safe
        * include/parallel/multiway_mergesort.h: made omp_dynamic-safe
        * include/parallel/search.h: made omp_dynamic-safe
        * include/parallel/partition.h: made omp_dynamic-safe
        * include/parallel/partial_sum.h: made omp_dynamic-safe
        * include/parallel/find.h: made omp_dynamic-safe
        * include/parallel/omp_loop.h: made omp_dynamic-safe
        * include/parallel/losertree.h: avoid default constructor

From-SVN: r130347
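The pattern behind the recurring ChangeLog entry "made omp_dynamic-safe" is visible in most hunks below: instead of trusting that the requested number of threads will actually be granted (which omp_set_dynamic(true) does not guarantee), the rewritten code enters the parallel region first, asks omp_get_num_threads() inside an omp single block, and only then allocates per-thread result storage. A minimal standalone sketch of that shape follows; the function and all names in it are illustrative, not libstdc++ code.

// Standalone sketch of the "omp_dynamic-safe" pattern used throughout this
// commit; reduce_example() and its names are hypothetical, not libstdc++.
#include <omp.h>
#include <cstdio>

int reduce_example(const int* data, int n)
{
  int requested = omp_get_max_threads();
  int num_threads = requested < n ? requested : n;
  int* thread_results = 0;      // allocated only once the team size is known

#pragma omp parallel num_threads(num_threads)
  {
#pragma omp single
    {
      // With omp_set_dynamic(true) the runtime may grant fewer threads than
      // requested, so the team size is (re)queried inside the region.
      num_threads = omp_get_num_threads();
      thread_results = new int[num_threads];
      for (int i = 0; i < num_threads; ++i)
        thread_results[i] = 0;
    }                           // implicit barrier: allocation is visible

    int iam = omp_get_thread_num();
#pragma omp for
    for (int pos = 0; pos < n; ++pos)
      thread_results[iam] += data[pos];
  }                             // end parallel

  int sum = 0;
  for (int i = 0; i < num_threads; ++i)
    sum += thread_results[i];
  delete[] thread_results;
  return sum;
}

int main()
{
  int data[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  std::printf("%d\n", reduce_example(data, 8));   // prints 36
  return 0;
}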
include/parallel/compiletime_settings.h
@@ -39,7 +39,7 @@
#include <cstdio>
/** @brief Determine verbosity level of the parallel mode.
* Level 1 prints a message each time when entering a parallel-mode function. */
* Level 1 prints a message each time a parallel-mode function is entered. */
#define _GLIBCXX_VERBOSE_LEVEL 0
/** @def _GLIBCXX_CALL
......@@ -50,27 +50,40 @@
#define _GLIBCXX_CALL(n)
#endif
#if (_GLIBCXX_VERBOSE_LEVEL == 1)
#define _GLIBCXX_CALL(n) printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#define _GLIBCXX_CALL(n) \
printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", \
__PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#endif
#ifndef _GLIBCXX_SCALE_DOWN_FPU
/** @brief Use floating-point scaling instead of modulo for mapping
* random numbers to a range. This can be faster on certain CPUs. */
#define _GLIBCXX_SCALE_DOWN_FPU 0
#endif
#ifndef _GLIBCXX_ASSERTIONS
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Should be switched on only locally. */
#define _GLIBCXX_ASSERTIONS 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Consider the size of the L1 cache for __gnu_parallel::parallel_random_shuffle(). */
* Consider the size of the L1 cache for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Consider the size of the TLB for __gnu_parallel::parallel_random_shuffle(). */
* Consider the size of the TLB for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
#endif
#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
/** @brief First copy the data, sort it locally, and merge it back
* (0); or copy it back after everything is done (1).
*
* Recommendation: 0 */
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
#endif
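Since every knob in this header is wrapped in #ifndef ("settings overridable" in the ChangeLog above), a build can override it before the parallel headers are pulled in. A hypothetical translation unit, compiled with -fopenmp; the chosen value 1 is only an example, not a recommendation:

// Hypothetical override of one of the guarded settings above.
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 1
#include <parallel/algorithm>   // includes compiletime_settings.h afterwards

int main() { return 0; }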
include/parallel/equally_split.h
@@ -39,30 +39,58 @@
namespace __gnu_parallel
{
/** @brief Function to split a sequence into parts of almost equal size.
/** @brief Function to split a sequence into parts of almost equal size.
*
* The resulting sequence s of length p+1 contains the splitting
* The resulting sequence s of length num_threads+1 contains the splitting
* positions when splitting the range [0,n) into parts of almost
* equal size (plus minus 1). The first entry is 0, the last one
* n. There may result empty parts.
* @param n Number of elements
* @param p Number of parts
* @param num_threads Number of parts
* @param s Splitters
* @returns End of splitter sequence, i. e. @c s+p+1 */
template<typename _DifferenceTp, typename OutputIterator>
* @returns End of splitter sequence, i. e. @c s+num_threads+1 */
template<typename difference_type, typename OutputIterator>
OutputIterator
equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s)
equally_split(difference_type n,
thread_index_t num_threads,
OutputIterator s)
{
typedef _DifferenceTp difference_type;
difference_type chunk_length = n / p, split = n % p, start = 0;
for (int i = 0; i < p; i++)
difference_type chunk_length = n / num_threads,
num_longer_chunks = n % num_threads,
pos = 0;
for (thread_index_t i = 0; i < num_threads; ++i)
{
*s++ = start;
start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length;
*s++ = pos;
pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
}
*s++ = n;
return s;
}
/** @brief Function to split a sequence into parts of almost equal size.
*
* Returns the position of the splitting point between
* thread number thread_no (included) and
* thread number thread_no+1 (excluded).
* @param n Number of elements
* @param num_threads Number of parts
* @returns Splitting point */
template<typename difference_type>
difference_type
equally_split_point(difference_type n,
thread_index_t num_threads,
thread_index_t thread_no)
{
difference_type chunk_length = n / num_threads,
num_longer_chunks = n % num_threads;
if(thread_no < num_longer_chunks)
return thread_no * (chunk_length + 1);
else
return num_longer_chunks * (chunk_length + 1)
+ (thread_no - num_longer_chunks) * chunk_length;
}
}
#endif
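The rewritten equally_split renames p to num_threads, and the new equally_split_point returns a single splitter directly: the first n % num_threads parts get one extra element each. A standalone restatement of the rule, with a small check (the names mirror the header but this is not the libstdc++ code itself):

// Standalone re-statement of the splitting rule used by equally_split /
// equally_split_point above (assumption: plain long/int indices for brevity).
#include <cassert>

long equally_split_point_sketch(long n, int num_threads, int thread_no)
{
  long chunk_length = n / num_threads;
  long num_longer_chunks = n % num_threads;
  if (thread_no < num_longer_chunks)
    return thread_no * (chunk_length + 1);
  return num_longer_chunks * (chunk_length + 1)
         + (thread_no - num_longer_chunks) * chunk_length;
}

int main()
{
  // Splitting 10 elements over 4 threads gives parts of size 3, 3, 2, 2,
  // i.e. the splitter sequence 0, 3, 6, 8, 10.
  long expected[5] = { 0, 3, 6, 8, 10 };
  for (int i = 0; i <= 4; ++i)
    assert(equally_split_point_sketch(10, 4, i) == expected[i]);
  return 0;
}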
include/parallel/features.h
@@ -66,7 +66,7 @@
* @brief Include guarded (sequences may run empty) loser tree,
* moving objects.
* @see __gnu_parallel::Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE 0
#define _GLIBCXX_LOSER_TREE 1
#endif
#ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
......
include/parallel/omp_loop.h
@@ -43,10 +43,11 @@
#include <parallel/settings.h>
#include <parallel/basic_iterator.h>
#include <parallel/base.h>
namespace __gnu_parallel
{
/** @brief Embarrassingly parallel algorithm for random access
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using an OpenMP for loop.
*
* @param begin Begin iterator of element sequence.
......@@ -63,34 +64,50 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_omp_loop(RandomAccessIterator begin, RandomAccessIterator end, Op o, Fu& f, Red r, Result base, Result& output, typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_omp_loop(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type;
typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : static_cast<thread_index_t>((end - begin));
Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++)
{
thread_results[i] = r(thread_results[i], f(o, begin+i));
}
Result *thread_results;
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
#pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
# pragma omp single
{
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
}
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
for (thread_index_t i = 0; i < num_threads; i++)
thread_results[i] = Result();
}
thread_index_t iam = omp_get_thread_num();
# pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
{
output = r(output, thread_results[i]);
}
delete [] thread_results;
......@@ -100,6 +117,7 @@ namespace __gnu_parallel
return o;
}
} // end namespace
#endif
include/parallel/omp_loop_static.h
@@ -64,39 +64,50 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_omp_loop_static(RandomAccessIterator begin,
for_each_template_random_access_omp_loop_static(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r,
Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin);
Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++)
{
thread_results[i] = r(thread_results[i], f(o, begin+i));
}
Result *thread_results;
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
#pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
# pragma omp single
{
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
}
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
for (thread_index_t i = 0; i < num_threads; i++)
thread_results[i] = Result();
}
thread_index_t iam = omp_get_thread_num();
# pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
{
output = r(output, thread_results[i]);
}
delete [] thread_results;
......@@ -106,6 +117,7 @@ namespace __gnu_parallel
return o;
}
} // end namespace
#endif
include/parallel/par_loop.h
@@ -41,11 +41,12 @@
#include <omp.h>
#include <parallel/settings.h>
#include <parallel/base.h>
namespace __gnu_parallel
{
/** @brief Embarrassingly parallel algorithm for random access
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using hand-crafted parallelization by equal splitting
* the work.
*
......@@ -63,47 +64,57 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_ed(RandomAccessIterator begin,
RandomAccessIterator end, Op o, Fu& f,
Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_ed(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
const difference_type length = end - begin;
const difference_type settings_threads = static_cast<difference_type>(get_max_threads());
const difference_type dmin = settings_threads < length ? settings_threads : length;
const difference_type dmax = dmin > 1 ? dmin : 1;
Result *thread_results;
thread_index_t num_threads = static_cast<thread_index_t>(dmax);
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
}
Result *thread_results = new Result[num_threads];
thread_index_t iam = omp_get_thread_num();
#pragma omp parallel num_threads(num_threads)
{
// Neutral element.
Result reduct = Result();
thread_index_t p = num_threads;
thread_index_t iam = omp_get_thread_num();
difference_type start = iam * length / p;
difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p;
difference_type
start = equally_split_point(length, num_threads, iam),
stop = equally_split_point(length, num_threads, iam + 1);
if (start < limit)
if (start < stop)
{
reduct = f(o, begin + start);
start++;
++start;
}
for (; start < limit; start++)
for (; start < stop; ++start)
reduct = r(reduct, f(o, begin + start));
thread_results[iam] = reduct;
}
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
output = r(output, thread_results[i]);
......
include/parallel/partial_sum.h
@@ -48,7 +48,7 @@ namespace __gnu_parallel
{
// Problem: there is no 0-element given.
/** @brief Base case prefix sum routine.
/** @brief Base case prefix sum routine.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
......@@ -56,9 +56,13 @@ namespace __gnu_parallel
* @param value Start value. Must be passed since the neutral
* element is unknown in general.
* @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
inline OutputIterator
parallel_partial_sum_basecase(InputIterator begin, InputIterator end,
parallel_partial_sum_basecase(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::value_type value)
{
......@@ -75,7 +79,7 @@ namespace __gnu_parallel
return result;
}
/** @brief Parallel partial sum implementation, two-phase approach,
/** @brief Parallel partial sum implementation, two-phase approach,
no recursion.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
......@@ -85,31 +89,49 @@ namespace __gnu_parallel
* @param num_threads Number of threads to use.
* @return End iterator of output sequence.
*/
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator
parallel_partial_sum_linear(InputIterator begin, InputIterator end,
parallel_partial_sum_linear(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::difference_type n, int num_threads)
typename std::iterator_traits<InputIterator>::difference_type n)
{
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
if (num_threads > (n - 1))
num_threads = static_cast<thread_index_t>(n - 1);
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), n - 1);
if (num_threads < 2)
{
*result = *begin;
return parallel_partial_sum_basecase(begin + 1, end, result + 1, bin_op, *begin);
return parallel_partial_sum_basecase(
begin + 1, end, result + 1, bin_op, *begin);
}
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 2)));
difference_type* borders;
value_type* sums;
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
if (Settings::partial_sum_dilatation == 1.0f)
equally_split(n, num_threads + 1, borders);
else
{
difference_type chunk_length = (int)((double)n / ((double)num_threads + Settings::partial_sum_dilatation)), borderstart = n - num_threads * chunk_length;
difference_type chunk_length =
((double)n /
((double)num_threads + Settings::partial_sum_dilatation)),
borderstart = n - num_threads * chunk_length;
borders[0] = 0;
for (int i = 1; i < (num_threads + 1); i++)
{
......@@ -119,13 +141,13 @@ namespace __gnu_parallel
borders[num_threads + 1] = n;
}
value_type* sums = static_cast<value_type*>(::operator new(sizeof(value_type) * num_threads));
sums = static_cast<value_type*>(
::operator new(sizeof(value_type) * num_threads));
OutputIterator target_end;
} //single
#pragma omp parallel num_threads(num_threads)
{
int id = omp_get_thread_num();
if (id == 0)
int iam = omp_get_thread_num();
if (iam == 0)
{
*result = *begin;
parallel_partial_sum_basecase(begin + 1, begin + borders[1],
......@@ -134,44 +156,48 @@ namespace __gnu_parallel
}
else
{
sums[id] = std::accumulate(begin + borders[id] + 1,
begin + borders[id + 1],
*(begin + borders[id]),
sums[iam] = std::accumulate(begin + borders[iam] + 1,
begin + borders[iam + 1],
*(begin + borders[iam]),
bin_op, __gnu_parallel::sequential_tag());
}
#pragma omp barrier
# pragma omp barrier
#pragma omp single
parallel_partial_sum_basecase(sums + 1, sums + num_threads, sums + 1,
bin_op, sums[0]);
# pragma omp single
parallel_partial_sum_basecase(
sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]);
#pragma omp barrier
# pragma omp barrier
// Still same team.
parallel_partial_sum_basecase(begin + borders[id + 1],
begin + borders[id + 2],
result + borders[id + 1], bin_op,
sums[id]);
}
parallel_partial_sum_basecase(begin + borders[iam + 1],
begin + borders[iam + 2],
result + borders[iam + 1], bin_op,
sums[iam]);
} //parallel
delete [] sums;
delete[] sums;
delete[] borders;
return result + n;
}
/** @brief Parallel partial sum front-end.
/** @brief Parallel partial sum front-end.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator
parallel_partial_sum(InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op)
{
_GLIBCXX_CALL(begin - end);
_GLIBCXX_CALL(begin - end)
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -179,14 +205,11 @@ namespace __gnu_parallel
difference_type n = end - begin;
int num_threads = get_max_threads();
switch (Settings::partial_sum_algorithm)
{
case Settings::LINEAR:
// Need an initial offset.
return parallel_partial_sum_linear(begin, end, result, bin_op,
n, num_threads);
return parallel_partial_sum_linear(begin, end, result, bin_op, n);
default:
// Partial_sum algorithm not implemented.
_GLIBCXX_PARALLEL_ASSERT(0);
......
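parallel_partial_sum_linear above follows a two-phase scheme: each thread first accumulates a plain sum of its piece, an exclusive prefix over those per-piece sums yields each piece's starting offset, and a second sweep then writes the real prefix sums. The sketch below is a simplified standalone version; unlike the library it splits into num_threads equal pieces (no dilatation factor, no special role for thread 0), which is an assumption made for brevity.

// Reduced two-phase prefix-sum sketch in the spirit of
// parallel_partial_sum_linear above; illustrative only.
#include <omp.h>
#include <vector>
#include <cstdio>

void prefix_sum_two_phase(const std::vector<long>& in, std::vector<long>& out)
{
  long n = static_cast<long>(in.size());
  int num_threads = 1;
  std::vector<long> sums;                 // one block total per thread

#pragma omp parallel
  {
#pragma omp single
    {
      num_threads = omp_get_num_threads();
      sums.assign(num_threads, 0);
    }                                     // implicit barrier

    int iam = omp_get_thread_num();
    long start = n * iam / num_threads;
    long stop  = n * (iam + 1) / num_threads;

    // Phase 1: each thread sums its own block.
    long local = 0;
    for (long i = start; i < stop; ++i)
      local += in[i];
    sums[iam] = local;

#pragma omp barrier
#pragma omp single
    {
      // Exclusive prefix over the block totals gives each block's offset.
      long running = 0;
      for (int t = 0; t < num_threads; ++t)
        {
          long s = sums[t];
          sums[t] = running;
          running += s;
        }
    }                                     // implicit barrier

    // Phase 2: rescan the block, starting from the correct offset.
    long acc = sums[iam];
    for (long i = start; i < stop; ++i)
      {
        acc += in[i];
        out[i] = acc;
      }
  }
}

int main()
{
  std::vector<long> in(8, 1), out(8);
  prefix_sum_two_phase(in, out);
  for (long i = 0; i < 8; ++i)
    std::printf("%ld ", out[i]);          // prints 1 2 3 4 5 6 7 8
  std::printf("\n");
  return 0;
}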
include/parallel/partition.h
@@ -45,21 +45,21 @@
#include <bits/stl_algo.h>
#include <parallel/parallel.h>
/** @brief Decide whether to declare certain variable volatile in this file. */
/** @brief Decide whether to declare certain variables volatile. */
#define _GLIBCXX_VOLATILE volatile
namespace __gnu_parallel
{
/** @brief Parallel implementation of std::partition.
/** @brief Parallel implementation of std::partition.
* @param begin Begin iterator of input sequence to split.
* @param end End iterator of input sequence to split.
* @param pred Partition predicate, possibly including some kind of pivot.
* @param max_num_threads Maximum number of threads to use for this task.
* @param num_threads Maximum number of threads to use for this task.
* @return Number of elements not fulfilling the predicate. */
template<typename RandomAccessIterator, typename Predicate>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
template<typename RandomAccessIterator, typename Predicate>
typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_partition(RandomAccessIterator begin, RandomAccessIterator end,
Predicate pred, thread_index_t max_num_threads)
Predicate pred, thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -74,25 +74,37 @@ namespace __gnu_parallel
_GLIBCXX_VOLATILE difference_type leftover_left, leftover_right;
_GLIBCXX_VOLATILE difference_type leftnew, rightnew;
bool* reserved_left, * reserved_right;
reserved_left = new bool[max_num_threads];
reserved_right = new bool[max_num_threads];
bool* reserved_left = NULL, * reserved_right = NULL;
difference_type chunk_size;
if (Settings::partition_chunk_share > 0.0)
chunk_size = std::max((difference_type)Settings::partition_chunk_size, (difference_type)((double)n * Settings::partition_chunk_share / (double)max_num_threads));
else
chunk_size = Settings::partition_chunk_size;
omp_lock_t result_lock;
omp_init_lock(&result_lock);
// At least good for two processors.
while (right - left + 1 >= 2 * max_num_threads * chunk_size)
//at least two chunks per thread
if(right - left + 1 >= 2 * num_threads * chunk_size)
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
reserved_left = new bool[num_threads];
reserved_right = new bool[num_threads];
if (Settings::partition_chunk_share > 0.0)
chunk_size = std::max<difference_type>(
Settings::partition_chunk_size,
(double)n * Settings::partition_chunk_share /
(double)num_threads);
else
chunk_size = Settings::partition_chunk_size;
}
while (right - left + 1 >= 2 * num_threads * chunk_size)
{
# pragma omp single
{
difference_type num_chunks = (right - left + 1) / chunk_size;
thread_index_t num_threads = (int)std::min((difference_type)max_num_threads, num_chunks / 2);
for (int r = 0; r < num_threads; r++)
{
......@@ -101,11 +113,11 @@ namespace __gnu_parallel
}
leftover_left = 0;
leftover_right = 0;
} //implicit barrier
#pragma omp parallel num_threads(num_threads)
{
// Private.
difference_type thread_left, thread_left_border, thread_right, thread_right_border;
difference_type thread_left, thread_left_border,
thread_right, thread_right_border;
thread_left = left + 1;
// Just to satisfy the condition below.
......@@ -150,12 +162,15 @@ namespace __gnu_parallel
// Swap as usual.
while (thread_left < thread_right)
{
while (pred(begin[thread_left]) && thread_left <= thread_left_border)
while (pred(begin[thread_left])
&& thread_left <= thread_left_border)
thread_left++;
while (!pred(begin[thread_right]) && thread_right >= thread_right_border)
while (!pred(begin[thread_right])
&& thread_right >= thread_right_border)
thread_right--;
if (thread_left > thread_left_border || thread_right < thread_right_border)
if (thread_left > thread_left_border
|| thread_right < thread_right_border)
// Fetch new chunk(s).
break;
......@@ -167,28 +182,29 @@ namespace __gnu_parallel
// Now swap the leftover chunks to the right places.
if (thread_left <= thread_left_border)
#pragma omp atomic
# pragma omp atomic
leftover_left++;
if (thread_right >= thread_right_border)
#pragma omp atomic
# pragma omp atomic
leftover_right++;
#pragma omp barrier
# pragma omp barrier
#pragma omp single
# pragma omp single
{
leftnew = left - leftover_left * chunk_size;
rightnew = right + leftover_right * chunk_size;
}
#pragma omp barrier
# pragma omp barrier
// <=> thread_left_border + (chunk_size - 1) >= leftnew
if (thread_left <= thread_left_border
&& thread_left_border >= leftnew)
{
// Chunk already in place, reserve spot.
reserved_left[(left - (thread_left_border + 1)) / chunk_size] = true;
reserved_left[(left - (thread_left_border + 1)) / chunk_size]
= true;
}
// <=> thread_right_border - (chunk_size - 1) <= rightnew
......@@ -196,12 +212,15 @@ namespace __gnu_parallel
&& thread_right_border <= rightnew)
{
// Chunk already in place, reserve spot.
reserved_right[((thread_right_border - 1) - right) / chunk_size] = true;
reserved_right
[((thread_right_border - 1) - right) / chunk_size]
= true;
}
#pragma omp barrier
# pragma omp barrier
if (thread_left <= thread_left_border && thread_left_border < leftnew)
if (thread_left <= thread_left_border
&& thread_left_border < leftnew)
{
// Find spot and swap.
difference_type swapstart = -1;
......@@ -219,7 +238,10 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif
std::swap_ranges(begin + thread_left_border - (chunk_size - 1), begin + thread_left_border + 1, begin + swapstart);
std::swap_ranges(
begin + thread_left_border - (chunk_size - 1),
begin + thread_left_border + 1,
begin + swapstart);
}
if (thread_right >= thread_right_border
......@@ -241,12 +263,14 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif
std::swap_ranges(begin + thread_right_border, begin + thread_right_border + chunk_size, begin + swapstart);
std::swap_ranges(begin + thread_right_border,
begin + thread_right_border + chunk_size,
begin + swapstart);
}
#if _GLIBCXX_ASSERTIONS
#pragma omp barrier
# pragma omp barrier
#pragma omp single
# pragma omp single
{
for (int r = 0; r < leftover_left; r++)
_GLIBCXX_PARALLEL_ASSERT(reserved_left[r]);
......@@ -254,14 +278,16 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(reserved_right[r]);
}
#pragma omp barrier
# pragma omp barrier
#endif
#pragma omp barrier
# pragma omp barrier
left = leftnew;
right = rightnew;
}
} // end "recursion"
# pragma omp flush(left, right)
} // end "recursion" //parallel
difference_type final_left = left, final_right = right;
......@@ -298,14 +324,14 @@ namespace __gnu_parallel
return final_left + 1;
}
/**
/**
* @brief Parallel implementation of std::nth_element().
* @param begin Begin iterator of input sequence.
* @param nth Iterator of element that must be in position afterwards.
* @param end End iterator of input sequence.
* @param comp Comparator.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
void
parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth,
RandomAccessIterator end, Comparator comp)
......@@ -377,12 +403,12 @@ namespace __gnu_parallel
__gnu_sequential::sort(begin, end, comp);
}
/** @brief Parallel implementation of std::partial_sort().
* @param begin Begin iterator of input sequence.
* @param middle Sort until this position.
* @param end End iterator of input sequence.
* @param comp Comparator. */
template<typename RandomAccessIterator, typename Comparator>
/** @brief Parallel implementation of std::partial_sort().
* @param begin Begin iterator of input sequence.
* @param middle Sort until this position.
* @param end End iterator of input sequence.
* @param comp Comparator. */
template<typename RandomAccessIterator, typename Comparator>
void
parallel_partial_sort(RandomAccessIterator begin, RandomAccessIterator middle, RandomAccessIterator end, Comparator comp)
{
......
include/parallel/quicksort.h
@@ -53,11 +53,17 @@ namespace __gnu_parallel
* this part.
*/
template<typename RandomAccessIterator, typename Comparator>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end,
inline
typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads)
typename std::iterator_traits<RandomAccessIterator>::difference_type
pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type
num_samples,
thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -65,20 +71,24 @@ namespace __gnu_parallel
difference_type n = end - begin;
num_samples = std::min(num_samples, n);
value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples));
// Allocate uninitialized, to avoid default constructor.
value_type* samples = static_cast<value_type*>(
operator new(num_samples * sizeof(value_type)));
for (difference_type s = 0; s < num_samples; s++)
{
const unsigned long long index = static_cast<unsigned long long>(s)
* n / num_samples;
samples[s] = begin[index];
new(samples + s) value_type(begin[index]);
}
__gnu_sequential::sort(samples, samples + num_samples, comp);
value_type& pivot = samples[pivot_rank * num_samples / n];
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot);
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
pred(comp, pivot);
difference_type split = parallel_partition(begin, end, pred, num_threads);
return split;
......@@ -93,7 +103,10 @@ namespace __gnu_parallel
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads)
parallel_sort_qs_conquer(RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -110,24 +123,27 @@ namespace __gnu_parallel
if (n <= 1)
return;
thread_index_t num_processors_left;
thread_index_t num_threads_left;
if ((num_threads % 2) == 1)
num_processors_left = num_threads / 2 + 1;
num_threads_left = num_threads / 2 + 1;
else
num_processors_left = num_threads / 2;
num_threads_left = num_threads / 2;
pivot_rank = n * num_processors_left / num_threads;
pivot_rank = n * num_threads_left / num_threads;
difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
difference_type split = parallel_sort_qs_divide(
begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
#pragma omp parallel sections
{
#pragma omp section
parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left);
parallel_sort_qs_conquer(begin, begin + split,
comp, num_threads_left);
#pragma omp section
parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left);
parallel_sort_qs_conquer(begin + split, end,
comp, num_threads - num_threads_left);
}
}
......@@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads);
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end,
parallel_sort_qs(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
int num_threads)
{
_GLIBCXX_CALL(n)
......@@ -165,10 +184,7 @@ Settings::sort_qs_num_samples_preset, num_threads);
// Hard to avoid.
omp_set_num_threads(num_threads);
bool old_nested = (omp_get_nested() != 0);
omp_set_nested(true);
parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
omp_set_nested(old_nested);
}
} //namespace __gnu_parallel
......
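parallel_sort_qs_conquer above splits the thread group roughly in half and picks pivot_rank = n * num_threads_left / num_threads, so each recursive call receives data in proportion to the threads it keeps. A small illustration of that division (arithmetic only, no sorting):

// Sketch of how parallel_sort_qs_conquer divides threads and data; the
// pivot rank gives each recursive call a share of the input proportional
// to its share of the threads.  Purely illustrative.
#include <cstdio>

void show_split(long n, int num_threads, int depth)
{
  if (num_threads <= 1 || n <= 1)
    return;
  int num_threads_left = (num_threads % 2 == 1) ? num_threads / 2 + 1
                                                : num_threads / 2;
  long pivot_rank = n * num_threads_left / num_threads;
  std::printf("%*sn=%ld, threads=%d -> left: %ld elems / %d threads, "
              "right: %ld elems / %d threads\n",
              2 * depth, "", n, num_threads,
              pivot_rank, num_threads_left,
              n - pivot_rank, num_threads - num_threads_left);
  show_split(pivot_rank, num_threads_left, depth + 1);
  show_split(n - pivot_rank, num_threads - num_threads_left, depth + 1);
}

int main()
{
  show_split(1000, 5, 0);   // 5 threads: 600 elems / 3 threads on the left
  return 0;
}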
include/parallel/search.h
@@ -53,7 +53,7 @@ namespace __gnu_parallel
* @param length Length of sequence to search for.
* @param advances Returned offsets.
*/
template<typename RandomAccessIterator, typename _DifferenceTp>
template<typename RandomAccessIterator, typename _DifferenceTp>
void
calc_borders(RandomAccessIterator elements, _DifferenceTp length,
_DifferenceTp* off)
......@@ -81,7 +81,10 @@ namespace __gnu_parallel
* @param end2 End iterator of second sequence.
* @param pred Find predicate.
* @return Place of finding in first sequences. */
template<typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename Pred>
template<
typename _RandomAccessIterator1,
typename _RandomAccessIterator2,
typename Pred>
_RandomAccessIterator1
search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1,
_RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
......@@ -103,27 +106,34 @@ namespace __gnu_parallel
// Where is first occurrence of pattern? defaults to end.
difference_type result = (end1 - begin1);
difference_type *splitters;
// Pattern too long.
if (input_length < 0)
return end1;
thread_index_t num_threads = std::max<difference_type>(1, std::min<difference_type>(input_length, __gnu_parallel::get_max_threads()));
omp_lock_t result_lock;
omp_init_lock(&result_lock);
difference_type borders[num_threads + 1];
__gnu_parallel::equally_split(input_length, num_threads, borders);
thread_index_t num_threads =
std::max<difference_type>(1,
std::min<difference_type>(input_length, get_max_threads()));
difference_type advances[pattern_length];
calc_borders(begin2, pattern_length, advances);
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
splitters = new difference_type[num_threads + 1];
equally_split(input_length, num_threads, splitters);
}
thread_index_t iam = omp_get_thread_num();
difference_type start = borders[iam], stop = borders[iam + 1];
difference_type start = splitters[iam], stop = splitters[iam + 1];
difference_type pos_in_pattern = 0;
bool found_pattern = false;
......@@ -131,11 +141,12 @@ namespace __gnu_parallel
while (start <= stop && !found_pattern)
{
// Get new value of result.
#pragma omp flush(result)
#pragma omp flush(result)
// No chance for this thread to find first occurrence.
if (result < start)
break;
while (pred(begin1[start + pos_in_pattern], begin2[pos_in_pattern]))
while (pred(begin1[start + pos_in_pattern],
begin2[pos_in_pattern]))
{
++pos_in_pattern;
if (pos_in_pattern == pattern_length)
......@@ -151,12 +162,15 @@ namespace __gnu_parallel
}
// Make safe jump.
start += (pos_in_pattern - advances[pos_in_pattern]);
pos_in_pattern = (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
pos_in_pattern =
(advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
} //parallel
omp_destroy_lock(&result_lock);
delete[] splitters;
// Return iterator on found element.
return (begin1 + result);
}
......
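calc_borders fills the advances array with, in effect, the Knuth-Morris-Pratt border table, which is what allows the "safe jump" start += pos_in_pattern - advances[pos_in_pattern] without rescanning already-matched characters. A sequential sketch of the same idea follows; this is generic KMP, not the libstdc++ calc_borders itself.

// Border (failure) table plus the safe-jump matching loop, sequentially.
#include <vector>
#include <string>
#include <cstdio>

std::vector<long> borders(const std::string& pattern)
{
  long m = static_cast<long>(pattern.size());
  std::vector<long> off(m + 1);
  off[0] = -1;
  long k = -1;
  for (long i = 1; i <= m; ++i)
    {
      // Length of the longest proper border of pattern[0, i).
      while (k >= 0 && pattern[k] != pattern[i - 1])
        k = off[k];
      off[i] = ++k;
    }
  return off;
}

long kmp_search(const std::string& text, const std::string& pattern)
{
  std::vector<long> off = borders(pattern);
  long m = static_cast<long>(pattern.size());
  long start = 0, pos = 0;
  while (start + m <= static_cast<long>(text.size()))
    {
      while (pos < m && text[start + pos] == pattern[pos])
        ++pos;
      if (pos == m)
        return start;                     // match found
      // Safe jump: shift the pattern past the mismatch without rescanning.
      start += pos - off[pos];
      pos = off[pos] < 0 ? 0 : off[pos];
    }
  return -1;
}

int main()
{
  std::printf("%ld\n", kmp_search("abcabcabd", "abcabd"));  // prints 3
  return 0;
}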
include/parallel/unique_copy.h
@@ -44,13 +44,16 @@
namespace __gnu_parallel
{
/** @brief Parallel std::unique_copy(), without explicit equality predicate.
/** @brief Parallel std::unique_copy(), w/o explicit equality predicate.
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @param binary_pred Equality predicate.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator, class BinaryPredicate>
template<
typename InputIterator,
class OutputIterator,
class BinaryPredicate>
inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result, BinaryPredicate binary_pred)
......@@ -62,20 +65,27 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
difference_type size = last - first;
int num_threads = __gnu_parallel::get_max_threads();
difference_type counter[num_threads + 1];
if (size == 0)
return result;
// Let the first thread process two parts.
difference_type borders[num_threads + 2];
__gnu_parallel::equally_split(size, num_threads + 1, borders);
difference_type *counter;
difference_type *borders;
thread_index_t num_threads = get_max_threads();
// First part contains at least one element.
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
int iam = omp_get_thread_num();
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
equally_split(size, num_threads + 1, borders);
counter = new difference_type[num_threads + 1];
}
thread_index_t iam = omp_get_thread_num();
difference_type begin, end;
......@@ -83,6 +93,7 @@ namespace __gnu_parallel
// Needed for position in output
difference_type i = 0;
OutputIterator out = result;
if (iam == 0)
{
begin = borders[0] + 1; // == 1
......@@ -120,7 +131,7 @@ namespace __gnu_parallel
// Last part still untouched.
difference_type begin_output;
#pragma omp barrier
# pragma omp barrier
// Store result in output on calculated positions.
begin_output = 0;
......@@ -170,15 +181,17 @@ namespace __gnu_parallel
for (int t = 0; t < num_threads + 1; t++)
end_output += counter[t];
delete[] borders;
return result + end_output;
}
/** @brief Parallel std::unique_copy(), without explicit equality predicate
/** @brief Parallel std::unique_copy(), without explicit equality predicate
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator>
template<typename InputIterator, class OutputIterator>
inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result)
......
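parallel_unique_copy above works in two passes separated by a barrier: every thread counts the elements of its piece that differ from their predecessor, the counters are turned into output offsets, and a second pass copies each piece to its offset. A simplified standalone sketch follows; the library additionally gives the first thread two pieces and reuses the splitters from equally_split, which this sketch omits.

// Count / prefix / copy scheme of a parallel unique_copy, illustrative only.
#include <omp.h>
#include <vector>
#include <cstdio>

long unique_copy_sketch(const std::vector<int>& in, std::vector<int>& out)
{
  long n = static_cast<long>(in.size());
  if (n == 0)
    return 0;

  int num_threads = 1;
  std::vector<long> counter;

#pragma omp parallel
  {
#pragma omp single
    {
      num_threads = omp_get_num_threads();
      counter.assign(num_threads + 1, 0);
    }                                     // implicit barrier

    int iam = omp_get_thread_num();
    long begin = 1 + (n - 1) * iam / num_threads;
    long end   = 1 + (n - 1) * (iam + 1) / num_threads;

    // Counting phase: elements of this block that differ from their
    // predecessor.  Element 0 is always kept and accounted for below.
    long cnt = 0;
    for (long i = begin; i < end; ++i)
      if (in[i] != in[i - 1])
        ++cnt;
    counter[iam + 1] = cnt;

#pragma omp barrier
#pragma omp single
    {
      counter[0] = 1;                     // slot for element 0
      for (int t = 1; t <= num_threads; ++t)
        counter[t] += counter[t - 1];     // prefix sums -> output offsets
      out[0] = in[0];
    }                                     // implicit barrier

    // Copy phase: write this block's kept elements at its offset.
    long pos = counter[iam];
    for (long i = begin; i < end; ++i)
      if (in[i] != in[i - 1])
        out[pos++] = in[i];
  }

  return counter[num_threads];            // length of the result
}

int main()
{
  int raw[7] = { 1, 1, 2, 2, 2, 3, 3 };
  std::vector<int> in(raw, raw + 7), out(7);
  std::printf("%ld unique elements\n", unique_copy_sketch(in, out)); // 3
  return 0;
}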
include/parallel/workstealing.h
@@ -55,8 +55,8 @@ namespace __gnu_parallel
#define _GLIBCXX_JOB_VOLATILE volatile
/** @brief One job for a certain thread. */
template<typename _DifferenceTp>
/** @brief One job for a certain thread. */
template<typename _DifferenceTp>
struct Job
{
typedef _DifferenceTp difference_type;
......@@ -78,7 +78,7 @@ namespace __gnu_parallel
_GLIBCXX_JOB_VOLATILE difference_type load;
};
/** @brief Work stealing algorithm for random access iterators.
/** @brief Work stealing algorithm for random access iterators.
*
* Uses O(1) additional memory. Synchronization at job lists is
* done with atomic operations.
......@@ -96,13 +96,20 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_workstealing(RandomAccessIterator begin,
for_each_template_random_access_workstealing(
RandomAccessIterator begin,
RandomAccessIterator end,
Op op, Fu& f, Red r,
Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
typename std::iterator_traits<RandomAccessIterator>::difference_type
bound)
{
_GLIBCXX_CALL(end - begin)
......@@ -110,34 +117,43 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
difference_type chunk_size = static_cast<difference_type>(Settings::workstealing_chunk_size);
difference_type chunk_size =
static_cast<difference_type>(Settings::workstealing_chunk_size);
// How many jobs?
difference_type length = (bound < 0) ? (end - begin) : bound;
// To avoid false sharing in a cache line.
const int stride = Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
const int stride =
Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
// Total number of threads currently working.
thread_index_t busy = 0;
thread_index_t num_threads = get_max_threads();
difference_type num_threads_min = num_threads < end - begin ? num_threads : end - begin;
Job<difference_type> *job;
omp_lock_t output_lock;
omp_init_lock(&output_lock);
// No more threads than jobs, at least one thread.
difference_type num_threads_max = num_threads_min > 1 ? num_threads_min : 1;
num_threads = static_cast<thread_index_t>(num_threads_max);
// Create job description array.
Job<difference_type> *job = new Job<difference_type>[num_threads * stride];
// Write base value to output.
output = base;
#pragma omp parallel shared(busy) num_threads(num_threads)
// No more threads than jobs, at least one thread.
thread_index_t num_threads =
__gnu_parallel::max<thread_index_t>(1,
__gnu_parallel::min<difference_type>(length, get_max_threads()));
# pragma omp parallel shared(busy) num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
// Create job description array.
job = new Job<difference_type>[num_threads * stride];
}
// Initialization phase.
// Flags for every thread if it is doing productive work.
......@@ -158,19 +174,22 @@ namespace __gnu_parallel
// Number of elements to steal in one attempt.
difference_type steal;
// Every thread has its own random number generator (modulo num_threads).
// Every thread has its own random number generator
// (modulo num_threads).
random_number rand_gen(iam, num_threads);
#pragma omp atomic
// This thread is currently working.
# pragma omp atomic
busy++;
iam_working = true;
// How many jobs per thread? last thread gets the rest.
my_job.first = static_cast<difference_type>(iam * (length / num_threads));
my_job.first =
static_cast<difference_type>(iam * (length / num_threads));
my_job.last = (iam == (num_threads - 1)) ? (length - 1) : ((iam + 1) * (length / num_threads) - 1);
my_job.last = (iam == (num_threads - 1)) ?
(length - 1) : ((iam + 1) * (length / num_threads) - 1);
my_job.load = my_job.last - my_job.first + 1;
// Init result with first value (to have a base value for reduction).
......@@ -185,26 +204,29 @@ namespace __gnu_parallel
RandomAccessIterator current;
#pragma omp barrier
# pragma omp barrier
// Actual work phase
// Work on own or stolen start
while (busy > 0)
{
// Work until no productive thread left.
#pragma omp flush(busy)
# pragma omp flush(busy)
// Thread has own work to do
while (my_job.first <= my_job.last)
{
// fetch-and-add call
// Reserve current job block (size chunk_size) in my queue.
difference_type current_job = fetch_and_add<difference_type>(&(my_job.first), chunk_size);
difference_type current_job =
fetch_and_add<difference_type>(&(my_job.first), chunk_size);
// Update load, to make the three values consistent,
// first might have been changed in the meantime
my_job.load = my_job.last - my_job.first + 1;
for (difference_type job_counter = 0; job_counter < chunk_size && current_job <= my_job.last; job_counter++)
for (difference_type job_counter = 0;
job_counter < chunk_size && current_job <= my_job.last;
job_counter++)
{
// Yes: process it!
current = begin + current_job;
......@@ -214,15 +236,14 @@ namespace __gnu_parallel
result = r(result, f(op, current));
}
#pragma omp flush(busy)
# pragma omp flush(busy)
}
// After reaching this point, a thread's job list is empty.
if (iam_working)
{
#pragma omp atomic
// This thread no longer has work.
# pragma omp atomic
busy--;
iam_working = false;
......@@ -231,16 +252,17 @@ namespace __gnu_parallel
difference_type supposed_first, supposed_last, supposed_load;
do
{
// Find random nonempty deque (not own) and do consistency check.
// Find random nonempty deque (not own), do consistency check.
yield();
#pragma omp flush(busy)
# pragma omp flush(busy)
victim = rand_gen();
supposed_first = job[victim * stride].first;
supposed_last = job[victim * stride].last;
supposed_load = job[victim * stride].load;
}
while (busy > 0
&& ((supposed_load <= 0) || ((supposed_first + supposed_load - 1) != supposed_last)));
&& ((supposed_load <= 0)
|| ((supposed_first + supposed_load - 1) != supposed_last)));
if (busy == 0)
break;
......@@ -251,40 +273,30 @@ namespace __gnu_parallel
// Number of elements to steal (at least one).
steal = (supposed_load < 2) ? 1 : supposed_load / 2;
// Protects against stealing threads
// omp_set_lock(&(job[victim * stride].lock));
// Push victim's start forward.
difference_type stolen_first = fetch_and_add<difference_type>(&(job[victim * stride].first), steal);
difference_type stolen_try = stolen_first + steal - difference_type(1);
// Protects against working thread
// omp_unset_lock(&(job[victim * stride].lock));
difference_type stolen_first =
fetch_and_add<difference_type>(
&(job[victim * stride].first), steal);
difference_type stolen_try =
stolen_first + steal - difference_type(1);
my_job.first = stolen_first;
// Avoid std::min dependencies.
my_job.last = stolen_try < supposed_last ? stolen_try : supposed_last;
my_job.last = __gnu_parallel::min(stolen_try, supposed_last);
my_job.load = my_job.last - my_job.first + 1;
//omp_unset_lock(&(my_job.lock));
#pragma omp atomic
// Has potential work again.
# pragma omp atomic
busy++;
iam_working = true;
#pragma omp flush(busy)
# pragma omp flush(busy)
}
#pragma omp flush(busy)
# pragma omp flush(busy)
} // end while busy > 0
// Add accumulated result to output.
omp_set_lock(&output_lock);
output = r(output, result);
omp_unset_lock(&output_lock);
//omp_destroy_lock(&(my_job.lock));
}
delete[] job;
......
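The inner loop of the work-stealing scheme above reserves work with fetch_and_add: a thread atomically advances the first index of a job (its own, or half of a victim's) and then owns the chunk it fetched. A reduced illustration of that reservation idiom, using the GCC __sync_fetch_and_add builtin in place of the library's fetch_and_add wrapper (and without the stealing and busy-counter machinery):

// Chunk reservation by atomic fetch-and-add, illustrative only.
#include <omp.h>
#include <cstdio>

int main()
{
  const long length = 100, chunk_size = 7;
  long first = 0;                 // shared queue head
  long processed = 0;

#pragma omp parallel reduction(+:processed)
  {
    for (;;)
      {
        // Reserve the next chunk atomically, as the work-stealing loop
        // above does with my_job.first.
        long begin = __sync_fetch_and_add(&first, chunk_size);
        if (begin >= length)
          break;
        long end = begin + chunk_size < length ? begin + chunk_size : length;
        for (long i = begin; i < end; ++i)
          ++processed;            // stand-in for f(op, begin + i)
      }
  }
  std::printf("%ld\n", processed);   // always 100
  return 0;
}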