Commit e683ee2a by Johannes Singler, committed by Johannes Singler

re PR libstdc++/33893 ([parallel mode] Algorithms rely on omp_set_dynamic(false))

2007-11-22  Johannes Singler  <singler@ira.uka.de>

        PR libstdc++/33893
        * include/parallel/multiway_merge.h: made omp_dynamic-safe
        * include/parallel/workstealing.h: made omp_dynamic-safe
        * include/parallel/base.h: infrastructure, cleanup
        * include/parallel/par_loop.h: made omp_dynamic-safe
        * include/parallel/features.h: activate loser tree variant
        * include/parallel/quicksort.h: made omp_dynamic-safe
        * include/parallel/compiletime_settings.h: settings overridable
        * include/parallel/equally_split.h: made omp_dynamic-safe
        * include/parallel/omp_loop_static.h: made omp_dynamic-safe
        * include/parallel/random_shuffle.h: made omp_dynamic-safe
        * include/parallel/balanced_quicksort.h: made omp_dynamic-safe
        * include/parallel/set_operations.h: made omp_dynamic-safe
        * include/parallel/unique_copy.h: made omp_dynamic-safe
        * include/parallel/multiway_mergesort.h: made omp_dynamic-safe
        * include/parallel/search.h: made omp_dynamic-safe
        * include/parallel/partition.h: made omp_dynamic-safe
        * include/parallel/partial_sum.h: made omp_dynamic-safe
        * include/parallel/find.h: made omp_dynamic-safe
        * include/parallel/omp_loop.h: made omp_dynamic-safe
        * include/parallel/losertree.h: avoid default constructor

From-SVN: r130347
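The common thread in this patch: with omp_set_dynamic() enabled, the OpenMP runtime may hand out fewer threads than requested, so the headers no longer size per-thread buffers from get_max_threads() up front. Instead they enter the parallel region, read omp_get_num_threads() inside a single section, and allocate the per-thread storage there. A minimal standalone sketch of that idiom (not code from the patch; the function and names are illustrative only):

#include <omp.h>
#include <cstddef>

// Hedged sketch of the "omp_dynamic-safe" idiom: request a thread count, but
// only trust omp_get_num_threads() once the team exists, and size per-thread
// storage at that point.
long parallel_sum(const int* data, std::size_t n)
{
  int num_threads = 1;
  long* partial = 0;
# pragma omp parallel num_threads(omp_get_max_threads())
  {
#   pragma omp single
    {
      // With omp_set_dynamic(true) this may be smaller than requested.
      num_threads = omp_get_num_threads();
      partial = new long[num_threads]();
    }   // implicit barrier: partial is valid for all threads below
    int iam = omp_get_thread_num();
#   pragma omp for
    for (long i = 0; i < (long)n; ++i)
      partial[iam] += data[i];
  }
  long sum = 0;
  for (int i = 0; i < num_threads; ++i)
    sum += partial[i];
  delete[] partial;
  return sum;
}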
@@ -39,7 +39,7 @@
#include <cstdio>
/** @brief Determine verbosity level of the parallel mode.
* Level 1 prints a message each time when entering a parallel-mode function. */
* Level 1 prints a message each time a parallel-mode function is entered. */
#define _GLIBCXX_VERBOSE_LEVEL 0
/** @def _GLIBCXX_CALL
@@ -50,27 +50,40 @@
#define _GLIBCXX_CALL(n)
#endif
#if (_GLIBCXX_VERBOSE_LEVEL == 1)
#define _GLIBCXX_CALL(n) printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#define _GLIBCXX_CALL(n) \
printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", \
__PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#endif
#ifndef _GLIBCXX_SCALE_DOWN_FPU
/** @brief Use floating-point scaling instead of modulo for mapping
* random numbers to a range. This can be faster on certain CPUs. */
#define _GLIBCXX_SCALE_DOWN_FPU 0
#endif
#ifndef _GLIBCXX_ASSERTIONS
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Should be switched on only locally. */
#define _GLIBCXX_ASSERTIONS 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Consider the size of the L1 cache for __gnu_parallel::parallel_random_shuffle(). */
* Consider the size of the L1 cache for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Consider the size of the TLB for __gnu_parallel::parallel_random_shuffle(). */
* Consider the size of the TLB for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
#endif
#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
/** @brief First copy the data, sort it locally, and merge it back
* (0); or copy it back after everything is done (1).
*
* Recommendation: 0 */
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
#endif
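Because each setting above is now wrapped in #ifndef, it can be overridden per translation unit or with -D on the compiler command line instead of editing compiletime_settings.h. A hedged example (not part of the patch; the values are purely illustrative):

// Override parallel-mode tuning macros before any parallel header is
// included; the #ifndef guards above then leave these values alone.
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 1
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 1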
@@ -39,30 +39,58 @@
namespace __gnu_parallel
{
/** @brief Function to split a sequence into parts of almost equal size.
*
* The resulting sequence s of length p+1 contains the splitting
* positions when splitting the range [0,n) into parts of almost
* equal size (plus minus 1). The first entry is 0, the last one
* n. There may result empty parts.
* @param n Number of elements
* @param p Number of parts
* @param s Splitters
* @returns End of splitter sequence, i. e. @c s+p+1 */
template<typename _DifferenceTp, typename OutputIterator>
/** @brief Function to split a sequence into parts of almost equal size.
*
* The resulting sequence s of length num_threads+1 contains the splitting
* positions when splitting the range [0,n) into parts of almost
* equal size (plus minus 1). The first entry is 0, the last one
* n. There may result empty parts.
* @param n Number of elements
* @param num_threads Number of parts
* @param s Splitters
* @returns End of splitter sequence, i. e. @c s+num_threads+1 */
template<typename difference_type, typename OutputIterator>
OutputIterator
equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s)
equally_split(difference_type n,
thread_index_t num_threads,
OutputIterator s)
{
typedef _DifferenceTp difference_type;
difference_type chunk_length = n / p, split = n % p, start = 0;
for (int i = 0; i < p; i++)
difference_type chunk_length = n / num_threads,
num_longer_chunks = n % num_threads,
pos = 0;
for (thread_index_t i = 0; i < num_threads; ++i)
{
*s++ = start;
start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length;
*s++ = pos;
pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
}
*s++ = n;
return s;
}
/** @brief Function to split a sequence into parts of almost equal size.
*
* Returns the position of the splitting point between
* thread number thread_no (included) and
* thread number thread_no+1 (excluded).
* @param n Number of elements
* @param num_threads Number of parts
* @returns Splitting point */
template<typename difference_type>
difference_type
equally_split_point(difference_type n,
thread_index_t num_threads,
thread_index_t thread_no)
{
difference_type chunk_length = n / num_threads,
num_longer_chunks = n % num_threads;
if(thread_no < num_longer_chunks)
return thread_no * (chunk_length + 1);
else
return num_longer_chunks * (chunk_length + 1)
+ (thread_no - num_longer_chunks) * chunk_length;
}
}
#endif
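For reference, a hedged usage sketch of the two helpers above (not part of the patch): splitting n = 10 elements into num_threads = 3 parts gives chunks of size 4, 3, 3, i.e. the splitter sequence 0, 4, 7, 10, and equally_split_point returns the same positions one at a time.

#include <parallel/equally_split.h>   // internal parallel-mode header
#include <cassert>

void equally_split_example()
{
  long s[3 + 1];
  __gnu_parallel::equally_split(10L, 3, s);
  assert(s[0] == 0 && s[1] == 4 && s[2] == 7 && s[3] == 10);
  for (int i = 0; i <= 3; ++i)
    assert(__gnu_parallel::equally_split_point(10L, 3, i) == s[i]);
}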
@@ -66,7 +66,7 @@
* @brief Include guarded (sequences may run empty) loser tree,
* moving objects.
* @see __gnu_parallel::Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE 0
#define _GLIBCXX_LOSER_TREE 1
#endif
#ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
@@ -43,54 +43,71 @@
#include <parallel/settings.h>
#include <parallel/basic_iterator.h>
#include <parallel/base.h>
namespace __gnu_parallel
{
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using an OpenMP for loop.
*
* @param begin Begin iterator of element sequence.
* @param end End iterator of element sequence.
* @param o User-supplied functor (comparator, predicate, adding
* functor, etc.).
* @param f Functor to "process" an element with op (depends on
* desired functionality, e. g. for std::for_each(), ...).
* @param r Functor to "add" a single result to the already
* processed elements (depends on functionality).
* @param base Base value for reduction.
* @param output Pointer to position where final result is written to
* @param bound Maximum number of elements processed (e. g. for
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using an OpenMP for loop.
*
* @param begin Begin iterator of element sequence.
* @param end End iterator of element sequence.
* @param o User-supplied functor (comparator, predicate, adding
* functor, etc.).
* @param f Functor to "process" an element with op (depends on
* desired functionality, e. g. for std::for_each(), ...).
* @param r Functor to "add" a single result to the already
* processed elements (depends on functionality).
* @param base Base value for reduction.
* @param output Pointer to position where final result is written to
* @param bound Maximum number of elements processed (e. g. for
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_omp_loop(RandomAccessIterator begin, RandomAccessIterator end, Op o, Fu& f, Red r, Result base, Result& output, typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_omp_loop(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type;
typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : static_cast<thread_index_t>((end - begin));
Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++)
Result *thread_results;
# pragma omp parallel num_threads(num_threads)
{
thread_results[i] = r(thread_results[i], f(o, begin+i));
}
#pragma omp parallel num_threads(num_threads)
{
#pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
{
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
}
}
# pragma omp single
{
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
for (thread_index_t i = 0; i < num_threads; i++)
thread_results[i] = Result();
}
thread_index_t iam = omp_get_thread_num();
# pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
{
output = r(output, thread_results[i]);
}
output = r(output, thread_results[i]);
delete [] thread_results;
@@ -100,6 +117,7 @@ namespace __gnu_parallel
return o;
}
} // end namespace
#endif
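A hedged usage sketch of the Op/Fu/Red protocol visible above (not from the patch; the functor names are hypothetical and the internal header is assumed to be usable directly with -fopenmp): counting the even elements of a vector.

#include <parallel/omp_loop.h>   // internal parallel-mode header
#include <vector>

struct no_op { };                     // Op: dummy state handed to Fu

struct count_even_fu                  // Fu: "process" one element
{
  template<typename It>
  int operator()(no_op&, It it) const { return (*it % 2 == 0) ? 1 : 0; }
};

struct plus_red                       // Red: combine two partial results
{
  int operator()(int a, int b) const { return a + b; }
};

int count_even(std::vector<int>& v)
{
  int result = 0;
  count_even_fu f;
  __gnu_parallel::for_each_template_random_access_omp_loop(
    v.begin(), v.end(), no_op(), f, plus_red(), 0, result, v.size());
  return result;
}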
@@ -64,39 +64,50 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_omp_loop_static(RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r,
Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_omp_loop_static(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin);
Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++)
Result *thread_results;
# pragma omp parallel num_threads(num_threads)
{
thread_results[i] = r(thread_results[i], f(o, begin+i));
}
#pragma omp parallel num_threads(num_threads)
{
#pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
{
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
}
}
# pragma omp single
{
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
for (thread_index_t i = 0; i < num_threads; i++)
thread_results[i] = Result();
}
thread_index_t iam = omp_get_thread_num();
# pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
{
output = r(output, thread_results[i]);
}
output = r(output, thread_results[i]);
delete [] thread_results;
@@ -106,6 +117,7 @@ namespace __gnu_parallel
return o;
}
} // end namespace
#endif
@@ -41,69 +41,80 @@
#include <omp.h>
#include <parallel/settings.h>
#include <parallel/base.h>
namespace __gnu_parallel
{
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using hand-crafted parallelization by equal splitting
* the work.
*
* @param begin Begin iterator of element sequence.
* @param end End iterator of element sequence.
* @param o User-supplied functor (comparator, predicate, adding
* functor, ...)
* @param f Functor to "process" an element with op (depends on
* desired functionality, e. g. for std::for_each(), ...).
* @param r Functor to "add" a single result to the already
* processed elements (depends on functionality).
* @param base Base value for reduction.
* @param output Pointer to position where final result is written to
* @param bound Maximum number of elements processed (e. g. for
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using hand-crafted parallelization by equal splitting
* the work.
*
* @param begin Begin iterator of element sequence.
* @param end End iterator of element sequence.
* @param o User-supplied functor (comparator, predicate, adding
* functor, ...)
* @param f Functor to "process" an element with op (depends on
* desired functionality, e. g. for std::for_each(), ...).
* @param r Functor to "add" a single result to the already
* processed elements (depends on functionality).
* @param base Base value for reduction.
* @param output Pointer to position where final result is written to
* @param bound Maximum number of elements processed (e. g. for
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_ed(RandomAccessIterator begin,
RandomAccessIterator end, Op o, Fu& f,
Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_ed(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
const difference_type length = end - begin;
const difference_type settings_threads = static_cast<difference_type>(get_max_threads());
const difference_type dmin = settings_threads < length ? settings_threads : length;
const difference_type dmax = dmin > 1 ? dmin : 1;
Result *thread_results;
thread_index_t num_threads = static_cast<thread_index_t>(dmax);
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
}
Result *thread_results = new Result[num_threads];
thread_index_t iam = omp_get_thread_num();
#pragma omp parallel num_threads(num_threads)
{
// Neutral element.
Result reduct = Result();
// Neutral element.
Result reduct = Result();
thread_index_t p = num_threads;
thread_index_t iam = omp_get_thread_num();
difference_type start = iam * length / p;
difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p;
difference_type
start = equally_split_point(length, num_threads, iam),
stop = equally_split_point(length, num_threads, iam + 1);
if (start < limit)
{
reduct = f(o, begin + start);
start++;
}
if (start < stop)
{
reduct = f(o, begin + start);
++start;
}
for (; start < limit; start++)
reduct = r(reduct, f(o, begin + start));
for (; start < stop; ++start)
reduct = r(reduct, f(o, begin + start));
thread_results[iam] = reduct;
}
thread_results[iam] = reduct;
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
output = r(output, thread_results[i]);
@@ -48,130 +48,156 @@ namespace __gnu_parallel
{
// Problem: there is no 0-element given.
/** @brief Base case prefix sum routine.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @param value Start value. Must be passed since the neutral
* element is unknown in general.
* @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
/** @brief Base case prefix sum routine.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @param value Start value. Must be passed since the neutral
* element is unknown in general.
* @return End iterator of output sequence. */
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
inline OutputIterator
parallel_partial_sum_basecase(InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::value_type value)
parallel_partial_sum_basecase(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::value_type value)
{
if (begin == end)
return result;
while (begin != end)
{
value = bin_op(value, *begin);
*result = value;
result++;
begin++;
value = bin_op(value, *begin);
*result = value;
result++;
begin++;
}
return result;
}
/** @brief Parallel partial sum implementation, two-phase approach,
no recursion.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @param n Length of sequence.
* @param num_threads Number of threads to use.
* @return End iterator of output sequence.
*/
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
/** @brief Parallel partial sum implementation, two-phase approach,
no recursion.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @param n Length of sequence.
* @param num_threads Number of threads to use.
* @return End iterator of output sequence.
*/
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator
parallel_partial_sum_linear(InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::difference_type n, int num_threads)
parallel_partial_sum_linear(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::difference_type n)
{
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
if (num_threads > (n - 1))
num_threads = static_cast<thread_index_t>(n - 1);
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), n - 1);
if (num_threads < 2)
{
*result = *begin;
return parallel_partial_sum_basecase(begin + 1, end, result + 1, bin_op, *begin);
*result = *begin;
return parallel_partial_sum_basecase(
begin + 1, end, result + 1, bin_op, *begin);
}
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 2)));
difference_type* borders;
value_type* sums;
if (Settings::partial_sum_dilatation == 1.0f)
equally_split(n, num_threads + 1, borders);
else
# pragma omp parallel num_threads(num_threads)
{
difference_type chunk_length = (int)((double)n / ((double)num_threads + Settings::partial_sum_dilatation)), borderstart = n - num_threads * chunk_length;
borders[0] = 0;
for (int i = 1; i < (num_threads + 1); i++)
{
borders[i] = borderstart;
borderstart += chunk_length;
}
borders[num_threads + 1] = n;
}
value_type* sums = static_cast<value_type*>(::operator new(sizeof(value_type) * num_threads));
OutputIterator target_end;
#pragma omp parallel num_threads(num_threads)
{
int id = omp_get_thread_num();
if (id == 0)
{
*result = *begin;
parallel_partial_sum_basecase(begin + 1, begin + borders[1],
result + 1, bin_op, *begin);
sums[0] = *(result + borders[1] - 1);
}
else
{
sums[id] = std::accumulate(begin + borders[id] + 1,
begin + borders[id + 1],
*(begin + borders[id]),
bin_op, __gnu_parallel::sequential_tag());
}
#pragma omp barrier
#pragma omp single
parallel_partial_sum_basecase(sums + 1, sums + num_threads, sums + 1,
bin_op, sums[0]);
#pragma omp barrier
// Still same team.
parallel_partial_sum_basecase(begin + borders[id + 1],
begin + borders[id + 2],
result + borders[id + 1], bin_op,
sums[id]);
}
delete [] sums;
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
if (Settings::partial_sum_dilatation == 1.0f)
equally_split(n, num_threads + 1, borders);
else
{
difference_type chunk_length =
((double)n /
((double)num_threads + Settings::partial_sum_dilatation)),
borderstart = n - num_threads * chunk_length;
borders[0] = 0;
for (int i = 1; i < (num_threads + 1); i++)
{
borders[i] = borderstart;
borderstart += chunk_length;
}
borders[num_threads + 1] = n;
}
sums = static_cast<value_type*>(
::operator new(sizeof(value_type) * num_threads));
OutputIterator target_end;
} //single
int iam = omp_get_thread_num();
if (iam == 0)
{
*result = *begin;
parallel_partial_sum_basecase(begin + 1, begin + borders[1],
result + 1, bin_op, *begin);
sums[0] = *(result + borders[1] - 1);
}
else
{
sums[iam] = std::accumulate(begin + borders[iam] + 1,
begin + borders[iam + 1],
*(begin + borders[iam]),
bin_op, __gnu_parallel::sequential_tag());
}
# pragma omp barrier
# pragma omp single
parallel_partial_sum_basecase(
sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]);
# pragma omp barrier
// Still same team.
parallel_partial_sum_basecase(begin + borders[iam + 1],
begin + borders[iam + 2],
result + borders[iam + 1], bin_op,
sums[iam]);
} //parallel
delete[] sums;
delete[] borders;
return result + n;
}
/** @brief Parallel partial sum front-end.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
/** @brief Parallel partial sum front-end.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @return End iterator of output sequence. */
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator
parallel_partial_sum(InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op)
OutputIterator result, BinaryOperation bin_op)
{
_GLIBCXX_CALL(begin - end);
_GLIBCXX_CALL(begin - end)
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
@@ -179,18 +205,15 @@ namespace __gnu_parallel
difference_type n = end - begin;
int num_threads = get_max_threads();
switch (Settings::partial_sum_algorithm)
{
case Settings::LINEAR:
// Need an initial offset.
return parallel_partial_sum_linear(begin, end, result, bin_op,
n, num_threads);
// Need an initial offset.
return parallel_partial_sum_linear(begin, end, result, bin_op, n);
default:
// Partial_sum algorithm not implemented.
_GLIBCXX_PARALLEL_ASSERT(0);
return result + n;
// Partial_sum algorithm not implemented.
_GLIBCXX_PARALLEL_ASSERT(0);
return result + n;
}
}
}
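For orientation, a hedged standalone sketch of the two-phase scheme that parallel_partial_sum_linear implements, written here for plain int addition rather than the library's iterator/functor form: each thread sums its chunk, an exclusive prefix sum over the per-thread totals yields each thread's starting offset, and a second pass writes the final prefix sums.

#include <omp.h>

// Hedged sketch (not library code): two-phase parallel prefix sum over ints.
void prefix_sum_two_phase(const int* in, int* out, int n)
{
  int num_threads = 1;
  long* sums = 0;
# pragma omp parallel
  {
#   pragma omp single
    {
      num_threads = omp_get_num_threads();
      sums = new long[num_threads];
    }
    int iam = omp_get_thread_num();
    int start = (int)((long long)n * iam / num_threads);
    int stop  = (int)((long long)n * (iam + 1) / num_threads);

    // Phase 1: each thread sums its own chunk.
    long local = 0;
    for (int i = start; i < stop; ++i)
      local += in[i];
    sums[iam] = local;
#   pragma omp barrier
#   pragma omp single
    {
      // Exclusive prefix sum over the per-thread totals = starting offsets.
      long acc = 0;
      for (int t = 0; t < num_threads; ++t)
        { long next = acc + sums[t]; sums[t] = acc; acc = next; }
    }   // implicit barrier

    // Phase 2: rescan the chunk starting from this thread's offset.
    long run = sums[iam];
    for (int i = start; i < stop; ++i)
      { run += in[i]; out[i] = (int)run; }
  }
  delete[] sums;
}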
@@ -53,11 +53,17 @@ namespace __gnu_parallel
* this part.
*/
template<typename RandomAccessIterator, typename Comparator>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads)
inline
typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type
pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type
num_samples,
thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
@@ -65,20 +71,24 @@
difference_type n = end - begin;
num_samples = std::min(num_samples, n);
value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples));
// Allocate uninitialized, to avoid default constructor.
value_type* samples = static_cast<value_type*>(
operator new(num_samples * sizeof(value_type)));
for (difference_type s = 0; s < num_samples; s++)
{
const unsigned long long index = static_cast<unsigned long long>(s)
* n / num_samples;
samples[s] = begin[index];
const unsigned long long index = static_cast<unsigned long long>(s)
* n / num_samples;
new(samples + s) value_type(begin[index]);
}
__gnu_sequential::sort(samples, samples + num_samples, comp);
value_type& pivot = samples[pivot_rank * num_samples / n];
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot);
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
pred(comp, pivot);
difference_type split = parallel_partition(begin, end, pred, num_threads);
return split;
@@ -93,7 +103,10 @@
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads)
parallel_sort_qs_conquer(RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
@@ -101,8 +114,8 @@
if (num_threads <= 1)
{
__gnu_sequential::sort(begin, end, comp);
return;
__gnu_sequential::sort(begin, end, comp);
return;
}
difference_type n = end - begin, pivot_rank;
@@ -110,24 +123,27 @@
if (n <= 1)
return;
thread_index_t num_processors_left;
thread_index_t num_threads_left;
if ((num_threads % 2) == 1)
num_processors_left = num_threads / 2 + 1;
num_threads_left = num_threads / 2 + 1;
else
num_processors_left = num_threads / 2;
num_threads_left = num_threads / 2;
pivot_rank = n * num_processors_left / num_threads;
pivot_rank = n * num_threads_left / num_threads;
difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
difference_type split = parallel_sort_qs_divide(
begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
#pragma omp parallel sections
{
#pragma omp section
parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left);
parallel_sort_qs_conquer(begin, begin + split,
comp, num_threads_left);
#pragma omp section
parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left);
parallel_sort_qs_conquer(begin + split, end,
comp, num_threads - num_threads_left);
}
}
@@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads);
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
parallel_sort_qs(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
int num_threads)
{
_GLIBCXX_CALL(n)
@@ -165,12 +184,9 @@ Settings::sort_qs_num_samples_preset, num_threads);
// Hard to avoid.
omp_set_num_threads(num_threads);
bool old_nested = (omp_get_nested() != 0);
omp_set_nested(true);
parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
omp_set_nested(old_nested);
}
} //namespace __gnu_parallel
} //namespace __gnu_parallel
#endif
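The sample buffer in parallel_sort_qs_divide above now takes raw storage from operator new and constructs elements with placement new, so value_type need not be default constructible (the same motivation as the losertree.h change). A hedged standalone sketch of that pattern, with an illustrative type:

#include <new>

// Illustrative type without a default constructor.
struct sample { int key; explicit sample(int k) : key(k) { } };

// Hedged sketch (not library code): raw allocation plus placement new,
// mirroring the samples buffer above.
void copy_samples(const int* src, int n)
{
  sample* buf = static_cast<sample*>(::operator new(n * sizeof(sample)));
  for (int i = 0; i < n; ++i)
    new (buf + i) sample(src[i]);   // construct each element in place
  // ... use buf[0 .. n) ...
  for (int i = 0; i < n; ++i)
    buf[i].~sample();               // destroy explicitly
  ::operator delete(buf);
}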
@@ -53,10 +53,10 @@ namespace __gnu_parallel
* @param length Length of sequence to search for.
* @param advances Returned offsets.
*/
template<typename RandomAccessIterator, typename _DifferenceTp>
template<typename RandomAccessIterator, typename _DifferenceTp>
void
calc_borders(RandomAccessIterator elements, _DifferenceTp length,
_DifferenceTp* off)
_DifferenceTp* off)
{
typedef _DifferenceTp difference_type;
@@ -66,9 +66,9 @@ namespace __gnu_parallel
difference_type k = 0;
for (difference_type j = 2; j <= length; j++)
{
while ((k >= 0) && !(elements[k] == elements[j-1]))
k = off[k];
off[j] = ++k;
while ((k >= 0) && !(elements[k] == elements[j-1]))
k = off[k];
off[j] = ++k;
}
}
@@ -81,11 +81,14 @@ namespace __gnu_parallel
* @param end2 End iterator of second sequence.
* @param pred Find predicate.
* @return Place of finding in first sequences. */
template<typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename Pred>
template<
typename _RandomAccessIterator1,
typename _RandomAccessIterator2,
typename Pred>
_RandomAccessIterator1
search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1,
_RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
Pred pred)
_RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
Pred pred)
{
typedef std::iterator_traits<_RandomAccessIterator1> traits_type;
typedef typename traits_type::difference_type difference_type;
@@ -103,60 +106,71 @@ namespace __gnu_parallel
// Where is first occurrence of pattern? defaults to end.
difference_type result = (end1 - begin1);
difference_type *splitters;
// Pattern too long.
if (input_length < 0)
return end1;
thread_index_t num_threads = std::max<difference_type>(1, std::min<difference_type>(input_length, __gnu_parallel::get_max_threads()));
omp_lock_t result_lock;
omp_init_lock(&result_lock);
difference_type borders[num_threads + 1];
__gnu_parallel::equally_split(input_length, num_threads, borders);
thread_index_t num_threads =
std::max<difference_type>(1,
std::min<difference_type>(input_length, get_max_threads()));
difference_type advances[pattern_length];
calc_borders(begin2, pattern_length, advances);
#pragma omp parallel num_threads(num_threads)
{
thread_index_t iam = omp_get_thread_num();
difference_type start = borders[iam], stop = borders[iam + 1];
difference_type pos_in_pattern = 0;
bool found_pattern = false;
while (start <= stop && !found_pattern)
{
// Get new value of result.
#pragma omp flush(result)
// No chance for this thread to find first occurrence.
if (result < start)
break;
while (pred(begin1[start + pos_in_pattern], begin2[pos_in_pattern]))
{
++pos_in_pattern;
if (pos_in_pattern == pattern_length)
{
// Found new candidate for result.
omp_set_lock(&result_lock);
result = std::min(result, start);
omp_unset_lock(&result_lock);
found_pattern = true;
break;
}
}
// Make safe jump.
start += (pos_in_pattern - advances[pos_in_pattern]);
pos_in_pattern = (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
}
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
splitters = new difference_type[num_threads + 1];
equally_split(input_length, num_threads, splitters);
}
thread_index_t iam = omp_get_thread_num();
difference_type start = splitters[iam], stop = splitters[iam + 1];
difference_type pos_in_pattern = 0;
bool found_pattern = false;
while (start <= stop && !found_pattern)
{
// Get new value of result.
#pragma omp flush(result)
// No chance for this thread to find first occurrence.
if (result < start)
break;
while (pred(begin1[start + pos_in_pattern],
begin2[pos_in_pattern]))
{
++pos_in_pattern;
if (pos_in_pattern == pattern_length)
{
// Found new candidate for result.
omp_set_lock(&result_lock);
result = std::min(result, start);
omp_unset_lock(&result_lock);
found_pattern = true;
break;
}
}
// Make safe jump.
start += (pos_in_pattern - advances[pos_in_pattern]);
pos_in_pattern =
(advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
} //parallel
omp_destroy_lock(&result_lock);
delete[] splitters;
// Return iterator on found element.
return (begin1 + result);
}
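A hedged usage sketch of search_template above (not from the patch; it assumes the internal <parallel/search.h> header is usable directly and the code is built with -fopenmp):

#include <parallel/search.h>   // internal parallel-mode header
#include <functional>
#include <vector>

std::vector<int>::iterator
find_first_occurrence(std::vector<int>& haystack, std::vector<int>& needle)
{
  return __gnu_parallel::search_template(haystack.begin(), haystack.end(),
                                         needle.begin(), needle.end(),
                                         std::equal_to<int>());
}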
@@ -44,16 +44,19 @@
namespace __gnu_parallel
{
/** @brief Parallel std::unique_copy(), without explicit equality predicate.
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @param binary_pred Equality predicate.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator, class BinaryPredicate>
/** @brief Parallel std::unique_copy(), w/o explicit equality predicate.
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @param binary_pred Equality predicate.
* @return End iterator of result sequence. */
template<
typename InputIterator,
class OutputIterator,
class BinaryPredicate>
inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result, BinaryPredicate binary_pred)
OutputIterator result, BinaryPredicate binary_pred)
{
_GLIBCXX_CALL(last - first)
@@ -62,126 +65,136 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
difference_type size = last - first;
int num_threads = __gnu_parallel::get_max_threads();
difference_type counter[num_threads + 1];
if (size == 0)
return result;
// Let the first thread process two parts.
difference_type borders[num_threads + 2];
__gnu_parallel::equally_split(size, num_threads + 1, borders);
difference_type *counter;
difference_type *borders;
thread_index_t num_threads = get_max_threads();
// First part contains at least one element.
#pragma omp parallel num_threads(num_threads)
{
int iam = omp_get_thread_num();
difference_type begin, end;
// Check for length without duplicates
// Needed for position in output
difference_type i = 0;
OutputIterator out = result;
if (iam == 0)
{
begin = borders[0] + 1; // == 1
end = borders[iam + 1];
i++;
new (static_cast<void *>(&*out)) value_type(*first);
out++;
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
i++;
new (static_cast<void *>(&*out)) value_type(*iter);
out++;
}
}
}
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
equally_split(size, num_threads + 1, borders);
counter = new difference_type[num_threads + 1];
}
thread_index_t iam = omp_get_thread_num();
difference_type begin, end;
// Check for length without duplicates
// Needed for position in output
difference_type i = 0;
OutputIterator out = result;
if (iam == 0)
{
begin = borders[0] + 1; // == 1
end = borders[iam + 1];
i++;
new (static_cast<void *>(&*out)) value_type(*first);
out++;
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
i++;
new (static_cast<void *>(&*out)) value_type(*iter);
out++;
}
}
}
else
{
begin = borders[iam]; //one part
end = borders[iam + 1];
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
i++;
}
}
}
{
begin = borders[iam]; //one part
end = borders[iam + 1];
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
i++;
}
}
}
counter[iam] = i;
// Last part still untouched.
difference_type begin_output;
#pragma omp barrier
# pragma omp barrier
// Store result in output on calculated positions.
begin_output = 0;
if (iam == 0)
{
for (int t = 0; t < num_threads; t++)
begin_output += counter[t];
{
for (int t = 0; t < num_threads; t++)
begin_output += counter[t];
i = 0;
i = 0;
OutputIterator iter_out = result + begin_output;
OutputIterator iter_out = result + begin_output;
begin = borders[num_threads];
end = size;
begin = borders[num_threads];
end = size;
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (iter == first || !binary_pred(*iter, *(iter-1)))
{
i++;
new (static_cast<void *>(&*iter_out)) value_type(*iter);
iter_out++;
}
}
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (iter == first || !binary_pred(*iter, *(iter-1)))
{
i++;
new (static_cast<void *>(&*iter_out)) value_type(*iter);
iter_out++;
}
}
counter[num_threads] = i;
}
counter[num_threads] = i;
}
else
{
for (int t = 0; t < iam; t++)
begin_output += counter[t];
OutputIterator iter_out = result + begin_output;
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
new (static_cast<void *> (&*iter_out)) value_type(*iter);
iter_out++;
}
}
}
{
for (int t = 0; t < iam; t++)
begin_output += counter[t];
OutputIterator iter_out = result + begin_output;
for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
new (static_cast<void *> (&*iter_out)) value_type(*iter);
iter_out++;
}
}
}
}
difference_type end_output = 0;
for (int t = 0; t < num_threads + 1; t++)
end_output += counter[t];
delete[] borders;
return result + end_output;
}
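A hedged usage sketch of the predicate overload above (not from the patch; same assumption that the internal header is usable directly): drop consecutive duplicates from a vector, then trim the output to the returned end iterator.

#include <parallel/unique_copy.h>   // internal parallel-mode header
#include <functional>
#include <vector>

std::vector<int> drop_consecutive_duplicates(const std::vector<int>& v)
{
  std::vector<int> out(v.size());
  std::vector<int>::iterator new_end =
    __gnu_parallel::parallel_unique_copy(v.begin(), v.end(), out.begin(),
                                         std::equal_to<int>());
  out.erase(new_end, out.end());
  return out;
}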
/** @brief Parallel std::unique_copy(), without explicit equality predicate
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator>
/** @brief Parallel std::unique_copy(), without explicit equality predicate
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator>
inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result)
OutputIterator result)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;