Commit e683ee2a by Johannes Singler, committed by Johannes Singler

re PR libstdc++/33893 ([parallel mode] Algorithms rely on omp_set_dynamic(false))

2007-11-22  Johannes Singler  <singler@ira.uka.de>

        PR libstdc++/33893
        * include/parallel/multiway_merge.h: made omp_dynamic-safe
        * include/parallel/workstealing.h: made omp_dynamic-safe
        * include/parallel/base.h: infrastructure, cleanup
        * include/parallel/par_loop.h: made omp_dynamic-safe
        * include/parallel/features.h: activate loser tree variant
        * include/parallel/quicksort.h: made omp_dynamic-safe
        * include/parallel/compiletime_settings.h: settings overridable
        * include/parallel/equally_split.h: made omp_dynamic-safe
        * include/parallel/omp_loop_static.h: made omp_dynamic-safe
        * include/parallel/random_shuffle.h: made omp_dynamic-safe
        * include/parallel/balanced_quicksort.h: made omp_dynamic-safe
        * include/parallel/set_operations.h: made omp_dynamic-safe
        * include/parallel/unique_copy.h: made omp_dynamic-safe
        * include/parallel/multiway_mergesort.h: made omp_dynamic-safe
        * include/parallel/search.h: made omp_dynamic-safe
        * include/parallel/partition.h: made omp_dynamic-safe
        * include/parallel/partial_sum.h: made omp_dynamic-safe
        * include/parallel/find.h: made omp_dynamic-safe
        * include/parallel/omp_loop.h: made omp_dynamic-safe
        * include/parallel/losertree.h: avoid default constructor

From-SVN: r130347
......@@ -63,8 +63,8 @@
namespace __gnu_parallel
{
/** @brief Information local to one thread in the parallel quicksort run. */
template<typename RandomAccessIterator>
/** @brief Information local to one thread in the parallel quicksort run. */
template<typename RandomAccessIterator>
struct QSBThreadLocal
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
......@@ -94,29 +94,17 @@ namespace __gnu_parallel
QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { }
};
/** @brief Initialize the thread local storage.
* @param tls Array of thread-local storages.
* @param queue_size Size of the work-stealing queue. */
template<typename RandomAccessIterator>
inline void
qsb_initialize(QSBThreadLocal<RandomAccessIterator>** tls, int queue_size)
{
int iam = omp_get_thread_num();
tls[iam] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
}
/** @brief Balanced quicksort divide step.
/** @brief Balanced quicksort divide step.
* @param begin Begin iterator of subsequence.
* @param end End iterator of subsequence.
* @param comp Comparator.
* @param num_threads Number of threads that are allowed to work on
* this part.
* @pre @c (end-begin)>=1 */
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
qsb_divide(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp, int num_threads)
Comparator comp, thread_index_t num_threads)
{
_GLIBCXX_PARALLEL_ASSERT(num_threads > 0);
......@@ -124,13 +112,15 @@ namespace __gnu_parallel
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
RandomAccessIterator pivot_pos = median_of_three_iterators(begin, begin + (end - begin) / 2, end - 1, comp);
RandomAccessIterator pivot_pos = median_of_three_iterators(
begin, begin + (end - begin) / 2, end - 1, comp);
#if defined(_GLIBCXX_ASSERTIONS)
// Must be in between somewhere.
difference_type n = end - begin;
_GLIBCXX_PARALLEL_ASSERT((!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
_GLIBCXX_PARALLEL_ASSERT(
(!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
|| (!comp(*pivot_pos, *begin) && !comp(*end, *pivot_pos))
|| (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos))
|| (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*end, *pivot_pos))
......@@ -143,10 +133,12 @@ namespace __gnu_parallel
std::swap(*pivot_pos, *(end - 1));
pivot_pos = end - 1;
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
pred(comp, *pivot_pos);
// Divide, returning end - begin - 1 in the worst case.
difference_type split_pos = parallel_partition(begin, end - 1, pred, num_threads);
difference_type split_pos = parallel_partition(
begin, end - 1, pred, num_threads);
// Swap back pivot to middle.
std::swap(*(begin + split_pos), *pivot_pos);
......@@ -163,18 +155,21 @@ namespace __gnu_parallel
return split_pos;
}
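A minimal sequential sketch of the divide step above, assuming random-access iterators: it takes the middle element as pivot for brevity where the real code calls median_of_three_iterators, and uses std::partition where the real code calls parallel_partition. All names below are illustrative and not part of the patch.

#include <algorithm>
#include <iterator>

// Park the pivot at the end, partition the rest, swap the pivot into its
// final position, and return that position as an offset from begin.
template<typename It, typename Comp>
  typename std::iterator_traits<It>::difference_type
  sketch_qsb_divide(It begin, It end, Comp comp)
  {
    It pivot_pos = begin + (end - begin) / 2;   // real code: median of three
    std::iter_swap(pivot_pos, end - 1);         // keep the pivot out of the way
    pivot_pos = end - 1;
    It split = std::partition(begin, end - 1,
        [&](const typename std::iterator_traits<It>::value_type& x)
        { return comp(x, *pivot_pos); });       // real code: parallel_partition
    std::iter_swap(split, pivot_pos);           // pivot lands at split_pos
    return split - begin;                       // everything left of it is smaller
  }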
/** @brief Quicksort conquer step.
/** @brief Quicksort conquer step.
* @param tls Array of thread-local storages.
* @param begin Begin iterator of subsequence.
* @param end End iterator of subsequence.
* @param comp Comparator.
* @param iam Number of the thread processing this function.
* @param num_threads Number of threads that are allowed to work on this part. */
template<typename RandomAccessIterator, typename Comparator>
* @param num_threads
* Number of threads that are allowed to work on this part. */
template<typename RandomAccessIterator, typename Comparator>
inline void
qsb_conquer(QSBThreadLocal<RandomAccessIterator>** tls,
RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp, thread_index_t iam, thread_index_t num_threads)
Comparator comp,
thread_index_t iam, thread_index_t num_threads,
bool parent_wait)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -182,12 +177,12 @@ namespace __gnu_parallel
difference_type n = end - begin;
if (num_threads <= 1 || n < 2)
if (num_threads <= 1 || n <= 1)
{
tls[iam]->initial.first = begin;
tls[iam]->initial.second = end;
qsb_local_sort_with_helping(tls, comp, iam);
qsb_local_sort_with_helping(tls, comp, iam, parent_wait);
return;
}
......@@ -199,33 +194,55 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin));
#endif
thread_index_t num_threads_leftside = std::max<thread_index_t>(1, std::min<thread_index_t>(num_threads - 1, split_pos * num_threads / n));
thread_index_t num_threads_leftside =
std::max<thread_index_t>(1, std::min<thread_index_t>(
num_threads - 1, split_pos * num_threads / n));
#pragma omp atomic
# pragma omp atomic
*tls[iam]->elements_leftover -= (difference_type)1;
// Conquer step.
#pragma omp parallel sections num_threads(2)
# pragma omp parallel num_threads(2)
{
bool wait;
if(omp_get_num_threads() < 2)
wait = false;
else
wait = parent_wait;
# pragma omp sections
{
#pragma omp section
qsb_conquer(tls, begin, begin + split_pos, comp, iam, num_threads_leftside);
# pragma omp section
{
qsb_conquer(tls, begin, begin + split_pos, comp,
iam,
num_threads_leftside,
wait);
wait = parent_wait;
}
// The pivot_pos is left in place, to ensure termination.
#pragma omp section
# pragma omp section
{
qsb_conquer(tls, begin + split_pos + 1, end, comp,
iam + num_threads_leftside, num_threads - num_threads_leftside);
iam + num_threads_leftside,
num_threads - num_threads_leftside,
wait);
wait = parent_wait;
}
}
}
}
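The heart of the omp_dynamic fix is visible above: the conquer step still requests two threads for the nested region, but it now asks omp_get_num_threads() whether nesting was actually granted and only keeps waiting for stolen work if it was. The thread split itself stays proportional to the split position; with n = 1000 elements, split_pos = 300 and num_threads = 8 (illustrative values), split_pos * num_threads / n = 2, so two threads recurse into the left part and six into the right. A standalone sketch of the nesting check, with made-up names:

#include <omp.h>
#include <cstdio>

// Request nested threads, then check how many were actually granted before
// relying on them; this is what keeps the code correct under omp_dynamic.
void conquer_sketch(bool parent_wait)
{
#pragma omp parallel num_threads(2)
  {
    // Private per thread: only wait for stolen work if real nesting happened.
    bool wait = (omp_get_num_threads() < 2) ? false : parent_wait;
#pragma omp sections
    {
#pragma omp section
      std::printf("left part, wait = %d\n", (int)wait);
#pragma omp section
      std::printf("right part, wait = %d\n", (int)wait);
    }
  }
}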
/**
/**
* @brief Quicksort step doing load-balanced local sort.
* @param tls Array of thread-local storages.
* @param comp Comparator.
* @param iam Number of the thread processing this function.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline void
qsb_local_sort_with_helping(QSBThreadLocal<RandomAccessIterator>** tls,
Comparator& comp, int iam)
Comparator& comp, int iam, bool wait)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -265,7 +282,9 @@ namespace __gnu_parallel
std::swap(*pivot_pos, *(end - 1));
pivot_pos = end - 1;
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
__gnu_parallel::binder2nd
<Comparator, value_type, value_type, bool>
pred(comp, *pivot_pos);
// Divide, leave pivot unchanged in last place.
RandomAccessIterator split_pos1, split_pos2;
......@@ -286,16 +305,19 @@ namespace __gnu_parallel
{
// Very unequal split, one part smaller than one 128th of the total;
// the predicate below matches elements not strictly larger than the pivot.
__gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos));
__gnu_parallel::unary_negate<__gnu_parallel::binder1st
<Comparator, value_type, value_type, bool>, value_type>
pred(__gnu_parallel::binder1st
<Comparator, value_type, value_type, bool>(
comp, *pivot_pos));
// Find other end of pivot-equal range.
split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred);
split_pos2 = __gnu_sequential::partition(
split_pos1 + 1, end, pred);
}
else
{
// Only skip the pivot.
split_pos2 = split_pos1 + 1;
}
// Elements equal to pivot are done.
elements_done += (split_pos2 - split_pos1);
......@@ -317,7 +339,8 @@ namespace __gnu_parallel
{
// Left side larger.
if (begin != split_pos1)
tl.leftover_parts.push_front(std::make_pair(begin, split_pos1));
tl.leftover_parts.push_front(
std::make_pair(begin, split_pos1));
current.first = split_pos2;
//current.second = end; //already set anyway
......@@ -336,8 +359,9 @@ namespace __gnu_parallel
if (tl.leftover_parts.pop_front(current))
continue;
#pragma omp atomic
# pragma omp atomic
*tl.elements_leftover -= elements_done;
elements_done = 0;
#if _GLIBCXX_ASSERTIONS
......@@ -345,8 +369,8 @@ namespace __gnu_parallel
#endif
// Look for new work.
bool success = false;
while (*tl.elements_leftover > 0 && !success
bool successfully_stolen = false;
while (wait && *tl.elements_leftover > 0 && !successfully_stolen
#if _GLIBCXX_ASSERTIONS
// Possible dead-lock.
&& (omp_get_wtime() < (search_start + 1.0))
......@@ -357,11 +381,12 @@ namespace __gnu_parallel
victim = rng(num_threads);
// Large pieces.
success = (victim != iam) && tls[victim]->leftover_parts.pop_back(current);
if (!success)
successfully_stolen = (victim != iam)
&& tls[victim]->leftover_parts.pop_back(current);
if (!successfully_stolen)
yield();
#if !defined(__ICC) && !defined(__ECC)
#pragma omp flush
# pragma omp flush
#endif
}
......@@ -369,10 +394,11 @@ namespace __gnu_parallel
if (omp_get_wtime() >= (search_start + 1.0))
{
sleep(1);
_GLIBCXX_PARALLEL_ASSERT(omp_get_wtime() < (search_start + 1.0));
_GLIBCXX_PARALLEL_ASSERT(
omp_get_wtime() < (search_start + 1.0));
}
#endif
if (!success)
if (!successfully_stolen)
{
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0);
......@@ -383,7 +409,7 @@ namespace __gnu_parallel
}
}
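When a thread's own deque runs dry, the loop above repeatedly picks a random victim and tries to steal the largest leftover piece from the back of that victim's queue, yielding between attempts; the wait flag added by this patch keeps a thread from spinning for work that no sibling thread can ever produce. A conceptual sketch of the stealing loop, with a hypothetical thread-safe deque type standing in for the library's queue:

// Conceptual sketch only: 'Queue' is any thread-safe deque offering pop_back(),
// and rng(k) returns a random index in [0, k); neither is the library's real type.
template<typename Queue, typename Piece, typename RandomNumber>
  bool
  steal_piece(Queue** queues, volatile long* elements_leftover,
              int num_threads, int iam, RandomNumber& rng, Piece& current)
  {
    bool stolen = false;
    while (*elements_leftover > 0 && !stolen)
      {
        int victim = rng(num_threads);                  // pick a random victim
        stolen = (victim != iam)                        // never steal from yourself
                 && queues[victim]->pop_back(current);  // take the largest piece
        // The real code yields and issues an OpenMP flush between attempts.
      }
    return stolen;
  }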
/** @brief Top-level quicksort routine.
/** @brief Top-level quicksort routine.
* @param begin Begin iterator of sequence.
* @param end End iterator of sequence.
* @param comp Comparator.
......@@ -391,11 +417,13 @@ namespace __gnu_parallel
* @param num_threads Number of threads that are allowed to work on
* this part.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
typename std::iterator_traits<RandomAccessIterator>
::difference_type n,
thread_index_t num_threads)
{
_GLIBCXX_CALL(end - begin)
......@@ -413,11 +441,11 @@ namespace __gnu_parallel
if (num_threads > n)
num_threads = static_cast<thread_index_t>(n);
// Initialize thread local storage
tls_type** tls = new tls_type*[num_threads];
#pragma omp parallel num_threads(num_threads)
// Initialize variables per processor.
qsb_initialize(tls, num_threads * (thread_index_t)(log2(n) + 1));
difference_type queue_size = num_threads * (thread_index_t)(log2(n) + 1);
for (thread_index_t t = 0; t < num_threads; ++t)
tls[t] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
// There can never be more than ceil(log2(n)) ranges on the stack, because
// 1. Only one processor pushes onto the stack
......@@ -434,14 +462,8 @@ namespace __gnu_parallel
tls[i]->initial = std::make_pair(end, end);
}
// Initial splitting, recursively.
int old_nested = omp_get_nested();
omp_set_nested(true);
// Main recursion call.
qsb_conquer(tls, begin, begin + n, comp, 0, num_threads);
omp_set_nested(old_nested);
qsb_conquer(tls, begin, begin + n, comp, 0, num_threads, true);
#if _GLIBCXX_ASSERTIONS
// All stack must be empty.
......
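The per-thread work-stealing queues above are sized so they can never overflow: a single thread holds at most about log2(n) pending ranges, because each leftover piece is at most half of the one it was split from, and there are num_threads potential producers. A worked example with illustrative numbers:

#include <cmath>
#include <cassert>

int main()
{
  // Illustrative values only: n = 1,000,000 elements, 8 threads.
  long n = 1000000;
  int num_threads = 8;
  long per_thread = (long)(std::log2((double)n) + 1);  // floor(log2(1e6) + 1) = 20
  long queue_size = num_threads * per_thread;          // 160 entries per queue
  assert(per_thread == 20 && queue_size == 160);
  return 0;
}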
......@@ -49,11 +49,11 @@ namespace __gnu_parallel
// XXX remove std::duplicates from here if possible,
// XXX but keep minimal dependencies.
/** @brief Calculates the rounded-down logarithm of @c n for base 2.
/** @brief Calculates the rounded-down logarithm of @c n for base 2.
* @param n Argument.
* @return Returns 0 for argument 0.
*/
template<typename Size>
template<typename Size>
inline Size
log2(Size n)
{
......@@ -63,7 +63,7 @@ namespace __gnu_parallel
return k;
}
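A small check of the contract stated above (rounded-down base-2 logarithm, returning 0 for argument 0), using an equivalent shift-based sketch rather than the library's own loop:

#include <cassert>

// Returns floor(log2(n)) for n >= 1, and 0 for n == 0.
template<typename Size>
  inline Size
  sketch_log2(Size n)
  {
    Size k = 0;
    for (n >>= 1; n > 0; n >>= 1)
      ++k;
    return k;
  }

int main()
{
  assert(sketch_log2(1u) == 0u);
  assert(sketch_log2(5u) == 2u);      // floor(log2(5)) = 2
  assert(sketch_log2(1024u) == 10u);
  assert(sketch_log2(0u) == 0u);      // special case documented above
  return 0;
}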
/** @brief Encode two integers into one __gnu_parallel::lcas_t.
/** @brief Encode two integers into one __gnu_parallel::lcas_t.
* @param a First integer, to be encoded in the most-significant @c
* lcas_t_bits/2 bits.
* @param b Second integer, to be encoded in the least-significant
......@@ -71,13 +71,13 @@ namespace __gnu_parallel
* @return __gnu_parallel::lcas_t value encoding @c a and @c b.
* @see decode2
*/
inline lcas_t
encode2(int a, int b) //must all be non-negative, actually
{
inline lcas_t
encode2(int a, int b) //must all be non-negative, actually
{
return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0);
}
}
/** @brief Decode two integers from one __gnu_parallel::lcas_t.
/** @brief Decode two integers from one __gnu_parallel::lcas_t.
* @param x __gnu_parallel::lcas_t to decode integers from.
* @param a First integer, to be decoded from the most-significant
* @c lcas_t_bits/2 bits of @c x.
......@@ -85,18 +85,34 @@ namespace __gnu_parallel
* @c lcas_t_bits/2 bits of @c x.
* @see encode2
*/
inline void
decode2(lcas_t x, int& a, int& b)
{
inline void
decode2(lcas_t x, int& a, int& b)
{
a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask);
b = (int)((x >> 0 ) & lcas_t_mask);
}
}
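Since encode2/decode2 pack two non-negative counters into one word for a combined compare-and-swap, a quick round-trip check makes the bit layout concrete. The lcas_t stand-in, bit width and mask below are assumptions made for the example; the real definitions live elsewhere in the library.

#include <cassert>
#include <cstdint>

int main()
{
  // 64-bit stand-ins for lcas_t, lcas_t_bits and lcas_t_mask (assumed here).
  typedef std::uint64_t lcas_demo_t;
  const int bits = 64;
  const lcas_demo_t mask = ((lcas_demo_t)1 << (bits / 2)) - 1;

  int a = 3, b = 40000;
  lcas_demo_t x = (((lcas_demo_t)a) << (bits / 2)) | (lcas_demo_t)b;  // encode2
  int a2 = (int)((x >> (bits / 2)) & mask);                           // decode2, high half
  int b2 = (int)(x & mask);                                           // decode2, low half
  assert(a2 == 3 && b2 == 40000);   // both values survive the round trip
  return 0;
}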
/** @brief Equivalent to std::min. */
template<typename T>
const T&
min(const T& a, const T& b)
{
return (a < b) ? a : b;
};
/** @brief Equivalent to std::max. */
template<typename T>
const T&
max(const T& a, const T& b)
{
return (a > b) ? a : b;
};
/** @brief Constructs predicate for equality from strict weak
/** @brief Constructs predicate for equality from strict weak
* ordering predicate
*/
// XXX comparator at the end, as per others
template<typename Comparator, typename T1, typename T2>
// XXX comparator at the end, as per others
template<typename Comparator, typename T1, typename T2>
class equal_from_less : public std::binary_function<T1, T2, bool>
{
private:
......@@ -112,8 +128,9 @@ namespace __gnu_parallel
};
/** @brief Similar to std::binder1st, but giving the argument types explicitly. */
template<typename _Predicate, typename argument_type>
/** @brief Similar to std::binder1st,
* but giving the argument types explicitly. */
template<typename _Predicate, typename argument_type>
class unary_negate
: public std::unary_function<argument_type, bool>
{
......@@ -129,8 +146,13 @@ namespace __gnu_parallel
{ return !_M_pred(__x); }
};
/** @brief Similar to std::binder1st, but giving the argument types explicitly. */
template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
/** @brief Similar to std::binder1st,
* but giving the argument types explicitly. */
template<
typename _Operation,
typename first_argument_type,
typename second_argument_type,
typename result_type>
class binder1st
: public std::unary_function<second_argument_type, result_type>
{
......@@ -154,11 +176,15 @@ namespace __gnu_parallel
{ return op(value, __x); }
};
/**
/**
* @brief Similar to std::binder2nd, but giving the argument types
* explicitly.
*/
template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
template<
typename _Operation,
typename first_argument_type,
typename second_argument_type,
typename result_type>
class binder2nd
: public std::unary_function<first_argument_type, result_type>
{
......@@ -182,16 +208,16 @@ namespace __gnu_parallel
{ return op(__x, value); }
};
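The explicit-type binders above turn a binary comparator into a unary predicate even when the comparator does not provide the argument_type typedefs std::binder2nd would need; the quicksort divide step uses binder2nd to fix the pivot as the second argument. A usage sketch with a simplified stand-in class (names are illustrative):

#include <cassert>
#include <functional>

// Simplified stand-in for __gnu_parallel::binder2nd: fixes the second
// argument of a binary operation, yielding a unary predicate.
template<typename Op, typename Arg1, typename Arg2, typename Result>
struct binder2nd_demo
{
  Op op;
  Arg2 value;
  binder2nd_demo(const Op& o, const Arg2& v) : op(o), value(v) { }
  Result operator()(const Arg1& x) const { return op(x, value); }
};

int main()
{
  // pred(x) is true iff x < 42, i.e. x belongs to the left partition.
  binder2nd_demo<std::less<int>, int, int, bool> pred(std::less<int>(), 42);
  assert(pred(7));
  assert(!pred(99));
  return 0;
}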
/** @brief Similar to std::equal_to, but allows two different types. */
template<typename T1, typename T2>
/** @brief Similar to std::equal_to, but allows two different types. */
template<typename T1, typename T2>
struct equal_to : std::binary_function<T1, T2, bool>
{
bool operator()(const T1& t1, const T2& t2) const
{ return t1 == t2; }
};
/** @brief Similar to std::less, but allows two different types. */
template<typename T1, typename T2>
/** @brief Similar to std::less, but allows two different types. */
template<typename T1, typename T2>
struct less : std::binary_function<T1, T2, bool>
{
bool
......@@ -203,9 +229,9 @@ namespace __gnu_parallel
{ return t2 < t1; }
};
// Partial specialization for one type. Same as std::less.
template<typename _Tp>
struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
// Partial specialization for one type. Same as std::less.
template<typename _Tp>
struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
{
bool
operator()(const _Tp& __x, const _Tp& __y) const
......@@ -214,21 +240,23 @@ namespace __gnu_parallel
/** @brief Similar to std::plus, but allows two different types. */
template<typename _Tp1, typename _Tp2>
template<typename _Tp1, typename _Tp2>
struct plus : public std::binary_function<_Tp1, _Tp2, _Tp1>
{
typedef typeof(*static_cast<_Tp1*>(NULL) + *static_cast<_Tp2*>(NULL)) result;
typedef typeof(*static_cast<_Tp1*>(NULL)
+ *static_cast<_Tp2*>(NULL)) result;
result
operator()(const _Tp1& __x, const _Tp2& __y) const
{ return __x + __y; }
};
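The typeof(...) used above is the GCC extension that predates C++11; it lets plus deduce the result type of mixing two different operand types. A modern sketch of the same deduction, for comparison only:

#include <utility>

// decltype-based equivalent of the typeof result deduction above;
// e.g. plus_demo<int, double>::result is double.
template<typename Tp1, typename Tp2>
struct plus_demo
{
  typedef decltype(std::declval<Tp1>() + std::declval<Tp2>()) result;

  result
  operator()(const Tp1& x, const Tp2& y) const
  { return x + y; }
};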
// Partial specialization for one type. Same as std::plus.
template<typename _Tp>
// Partial specialization for one type. Same as std::plus.
template<typename _Tp>
struct plus<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
{
typedef typeof(*static_cast<_Tp*>(NULL) + *static_cast<_Tp*>(NULL)) result;
typedef typeof(*static_cast<_Tp*>(NULL)
+ *static_cast<_Tp*>(NULL)) result;
result
operator()(const _Tp& __x, const _Tp& __y) const
......@@ -236,22 +264,24 @@ namespace __gnu_parallel
};
/** @brief Similar to std::multiplies, but allows two different types. */
template<typename _Tp1, typename _Tp2>
/** @brief Similar to std::multiplies, but allows two different types. */
template<typename _Tp1, typename _Tp2>
struct multiplies : public std::binary_function<_Tp1, _Tp2, _Tp1>
{
typedef typeof(*static_cast<_Tp1*>(NULL) * *static_cast<_Tp2*>(NULL)) result;
typedef typeof(*static_cast<_Tp1*>(NULL)
* *static_cast<_Tp2*>(NULL)) result;
result
operator()(const _Tp1& __x, const _Tp2& __y) const
{ return __x * __y; }
};
// Partial specialization for one type. Same as std::multiplies.
template<typename _Tp>
// Partial specialization for one type. Same as std::multiplies.
template<typename _Tp>
struct multiplies<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
{
typedef typeof(*static_cast<_Tp*>(NULL) * *static_cast<_Tp*>(NULL)) result;
typedef typeof(*static_cast<_Tp*>(NULL)
* *static_cast<_Tp*>(NULL)) result;
result
operator()(const _Tp& __x, const _Tp& __y) const
......@@ -259,15 +289,15 @@ namespace __gnu_parallel
};
template<typename T, typename _DifferenceTp>
template<typename T, typename _DifferenceTp>
class pseudo_sequence;
/** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
/** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
* It features the usual random-access iterator functionality.
* @param T Sequence value type.
* @param difference_type Sequence difference type.
*/
template<typename T, typename _DifferenceTp>
template<typename T, typename _DifferenceTp>
class pseudo_sequence_iterator
{
public:
......@@ -317,13 +347,13 @@ namespace __gnu_parallel
{ return pos - i2.pos; }
};
/** @brief Sequence that conceptually consists of multiple copies of
/** @brief Sequence that conceptually consists of multiple copies of
the same element.
* The copies are not stored explicitly, of course.
* @param T Sequence value type.
* @param difference_type Sequence difference type.
*/
template<typename T, typename _DifferenceTp>
template<typename T, typename _DifferenceTp>
class pseudo_sequence
{
typedef pseudo_sequence<T, _DifferenceTp> type;
......@@ -356,23 +386,23 @@ namespace __gnu_parallel
difference_type count;
};
/** @brief Functor that does nothing */
template<typename _ValueTp>
/** @brief Functor that does nothing */
template<typename _ValueTp>
class void_functor
{
inline void
operator()(const _ValueTp& v) const { }
};
/** @brief Compute the median of three referenced elements,
/** @brief Compute the median of three referenced elements,
according to @c comp.
* @param a First iterator.
* @param b Second iterator.
* @param c Third iterator.
* @param comp Comparator.
*/
template<typename RandomAccessIterator, typename Comparator>
RandomAccessIterator
template<typename RandomAccessIterator, typename Comparator>
RandomAccessIterator
median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b,
RandomAccessIterator c, Comparator& comp)
{
......@@ -397,19 +427,19 @@ namespace __gnu_parallel
}
}
// Avoid the use of assert, because we're trying to keep the <cassert>
// include out of the mix. (Same as debug mode).
inline void
__replacement_assert(const char* __file, int __line,
// Avoid the use of assert, because we're trying to keep the <cassert>
// include out of the mix. (Same as debug mode).
inline void
__replacement_assert(const char* __file, int __line,
const char* __function, const char* __condition)
{
{
std::printf("%s:%d: %s: Assertion '%s' failed.\n", __file, __line,
__function, __condition);
__builtin_abort();
}
}
#define _GLIBCXX_PARALLEL_ASSERT(_Condition) \
do \
do \
{ \
if (!(_Condition)) \
__gnu_parallel::__replacement_assert(__FILE__, __LINE__, \
......@@ -419,4 +449,3 @@ namespace __gnu_parallel
} //namespace __gnu_parallel
#endif
......@@ -39,7 +39,7 @@
#include <cstdio>
/** @brief Determine verbosity level of the parallel mode.
* Level 1 prints a message each time when entering a parallel-mode function. */
* Level 1 prints a message each time a parallel-mode function is entered. */
#define _GLIBCXX_VERBOSE_LEVEL 0
/** @def _GLIBCXX_CALL
......@@ -50,27 +50,40 @@
#define _GLIBCXX_CALL(n)
#endif
#if (_GLIBCXX_VERBOSE_LEVEL == 1)
#define _GLIBCXX_CALL(n) printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#define _GLIBCXX_CALL(n) \
printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", \
__PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#endif
#ifndef _GLIBCXX_SCALE_DOWN_FPU
/** @brief Use floating-point scaling instead of modulo for mapping
* random numbers to a range. This can be faster on certain CPUs. */
#define _GLIBCXX_SCALE_DOWN_FPU 0
#endif
#ifndef _GLIBCXX_ASSERTIONS
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Should be switched on only locally. */
#define _GLIBCXX_ASSERTIONS 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
/** @brief Consider the size of the L1 cache for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
/** @brief Consider the size of the TLB for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
#endif
#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
/** @brief First copy the data, sort it locally, and merge it back
* (0); or copy it back after everything is done (1).
*
* Recommendation: 0 */
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
#endif
......@@ -39,30 +39,58 @@
namespace __gnu_parallel
{
/** @brief Function to split a sequence into parts of almost equal size.
/** @brief Function to split a sequence into parts of almost equal size.
*
* The resulting sequence s of length p+1 contains the splitting
* The resulting sequence s of length num_threads+1 contains the splitting
* positions when splitting the range [0,n) into parts of almost
* equal size (plus or minus 1). The first entry is 0, the last one
* n. Some parts may be empty.
* @param n Number of elements
* @param p Number of parts
* @param num_threads Number of parts
* @param s Splitters
* @returns End of splitter sequence, i. e. @c s+p+1 */
template<typename _DifferenceTp, typename OutputIterator>
* @returns End of splitter sequence, i. e. @c s+num_threads+1 */
template<typename difference_type, typename OutputIterator>
OutputIterator
equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s)
equally_split(difference_type n,
thread_index_t num_threads,
OutputIterator s)
{
typedef _DifferenceTp difference_type;
difference_type chunk_length = n / p, split = n % p, start = 0;
for (int i = 0; i < p; i++)
difference_type chunk_length = n / num_threads,
num_longer_chunks = n % num_threads,
pos = 0;
for (thread_index_t i = 0; i < num_threads; ++i)
{
*s++ = start;
start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length;
*s++ = pos;
pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
}
*s++ = n;
return s;
}
/** @brief Function to split a sequence into parts of almost equal size.
*
* Returns the position of the splitting point between
* thread number thread_no (included) and
* thread number thread_no+1 (excluded).
* @param n Number of elements
* @param num_threads Number of parts
* @returns Splitting point */
template<typename difference_type>
difference_type
equally_split_point(difference_type n,
thread_index_t num_threads,
thread_index_t thread_no)
{
difference_type chunk_length = n / num_threads,
num_longer_chunks = n % num_threads;
if(thread_no < num_longer_chunks)
return thread_no * (chunk_length + 1);
else
return num_longer_chunks * (chunk_length + 1)
+ (thread_no - num_longer_chunks) * chunk_length;
}
}
#endif
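A worked example of the splitting scheme above, written against plain arrays (the values are illustrative): 10 elements over 3 threads give one longer chunk of 4 and two chunks of 3, so the splitters are 0, 4, 7, 10, and equally_split_point yields the same boundaries one at a time.

#include <cassert>

int main()
{
  // n = 10, num_threads = 3: chunk_length = 3, one longer chunk.
  const int n = 10, p = 3;
  int s[p + 1];
  int chunk = n / p, longer = n % p, pos = 0;
  for (int i = 0; i < p; ++i)
    {
      s[i] = pos;
      pos += (i < longer) ? chunk + 1 : chunk;
    }
  s[p] = n;
  assert(s[0] == 0 && s[1] == 4 && s[2] == 7 && s[3] == 10);
  // equally_split_point(10, 3, t) would likewise give 0, 4 and 7 for t = 0, 1, 2.
  return 0;
}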
......@@ -66,7 +66,7 @@
* @brief Include guarded (sequences may run empty) loser tree,
* moving objects.
* @see __gnu_parallel::Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE 0
#define _GLIBCXX_LOSER_TREE 1
#endif
#ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
......
......@@ -10,7 +10,7 @@
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
// You should have received a copy of the GNU General Public License
......@@ -48,7 +48,7 @@
namespace __gnu_parallel
{
/**
/**
* @brief Parallel std::find, switch for different algorithms.
* @param begin1 Begin iterator of first sequence.
* @param end1 End iterator of first sequence.
......@@ -58,7 +58,11 @@ namespace __gnu_parallel
* @param selector Functionality (e. g. std::find_if (), std::equal(),...)
* @return Place of finding in both sequences.
*/
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Pred,
typename Selector>
std::pair<RandomAccessIterator1, RandomAccessIterator2>
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
RandomAccessIterator2 begin2, Pred pred, Selector selector)
......@@ -66,11 +70,14 @@ namespace __gnu_parallel
switch (Settings::find_distribution)
{
case Settings::GROWING_BLOCKS:
return find_template(begin1, end1, begin2, pred, selector, growing_blocks_tag());
return find_template(begin1, end1, begin2, pred, selector,
growing_blocks_tag());
case Settings::CONSTANT_SIZE_BLOCKS:
return find_template(begin1, end1, begin2, pred, selector, constant_size_blocks_tag());
return find_template(begin1, end1, begin2, pred, selector,
constant_size_blocks_tag());
case Settings::EQUAL_SPLIT:
return find_template(begin1, end1, begin2, pred, selector, equal_split_tag());
return find_template(begin1, end1, begin2, pred, selector,
equal_split_tag());
default:
_GLIBCXX_PARALLEL_ASSERT(false);
return std::make_pair(begin1, begin2);
......@@ -79,7 +86,7 @@ namespace __gnu_parallel
#if _GLIBCXX_FIND_EQUAL_SPLIT
/**
/**
* @brief Parallel std::find, equal splitting variant.
* @param begin1 Begin iterator of first sequence.
* @param end1 End iterator of first sequence.
......@@ -89,9 +96,18 @@ namespace __gnu_parallel
* @param selector Functionality (e. g. std::find_if (), std::equal(),...)
* @return Place of finding in both sequences.
*/
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Pred,
typename Selector>
std::pair<RandomAccessIterator1, RandomAccessIterator2>
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, RandomAccessIterator2 begin2, Pred pred, Selector selector, equal_split_tag)
find_template(RandomAccessIterator1 begin1,
RandomAccessIterator1 end1,
RandomAccessIterator2 begin2,
Pred pred,
Selector selector,
equal_split_tag)
{
_GLIBCXX_CALL(end1 - begin1)
......@@ -100,27 +116,30 @@ namespace __gnu_parallel
typedef typename traits_type::value_type value_type;
difference_type length = end1 - begin1;
difference_type result = length;
difference_type* borders;
const thread_index_t num_threads = get_max_threads();
omp_lock_t result_lock;
omp_init_lock(&result_lock);
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1)));
thread_index_t num_threads = get_max_threads();
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 1];
equally_split(length, num_threads, borders);
} //single
#pragma omp parallel shared(result) num_threads(num_threads)
{
int iam = omp_get_thread_num();
difference_type pos = borders[iam], limit = borders[iam + 1];
thread_index_t iam = omp_get_thread_num();
difference_type start = borders[iam], stop = borders[iam + 1];
RandomAccessIterator1 i1 = begin1 + pos;
RandomAccessIterator2 i2 = begin2 + pos;
for (; pos < limit; pos++)
RandomAccessIterator1 i1 = begin1 + start;
RandomAccessIterator2 i2 = begin2 + start;
for (difference_type pos = start; pos < stop; ++pos)
{
#pragma omp flush(result)
#pragma omp flush(result)
// Result has been set to something lower.
if (result < pos)
break;
......@@ -128,25 +147,28 @@ namespace __gnu_parallel
if (selector(i1, i2, pred))
{
omp_set_lock(&result_lock);
if (result > pos)
if (pos < result)
result = pos;
omp_unset_lock(&result_lock);
break;
}
i1++;
i2++;
}
++i1;
++i2;
}
} //parallel
omp_destroy_lock(&result_lock);
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
delete[] borders;
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
begin1 + result, begin2 + result);
}
#endif
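The equal-split variant above gives every thread a fixed slice, publishes the best hit so far in a shared result index guarded by an OpenMP lock, and uses flush so threads can stop early once an earlier match is known. A simplified standalone sketch of that structure (it drops the selector/predicate machinery and the borders array of the real code):

#include <omp.h>
#include <vector>

// Returns the index of the first occurrence of x in v, or v.size() if absent.
long equal_split_find_sketch(const std::vector<int>& v, int x)
{
  long n = (long)v.size(), result = n;
  omp_lock_t result_lock;
  omp_init_lock(&result_lock);
#pragma omp parallel shared(result)
  {
    int iam = omp_get_thread_num(), p = omp_get_num_threads();
    long start = n * iam / p, stop = n * (iam + 1) / p;
    for (long pos = start; pos < stop; ++pos)
      {
#pragma omp flush(result)
        if (result < pos)            // an earlier thread already won
          break;
        if (v[pos] == x)
          {
            omp_set_lock(&result_lock);
            if (pos < result)        // keep the leftmost match
              result = pos;
            omp_unset_lock(&result_lock);
            break;
          }
      }
  }
  omp_destroy_lock(&result_lock);
  return result;
}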
#if _GLIBCXX_FIND_GROWING_BLOCKS
/**
/**
* @brief Parallel std::find, growing block size variant.
* @param begin1 Begin iterator of first sequence.
* @param end1 End iterator of first sequence.
......@@ -168,7 +190,11 @@ namespace __gnu_parallel
* for CSB, the blocks are allocated in a predetermined manner,
* namely spatial round-robin.
*/
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Pred,
typename Selector>
std::pair<RandomAccessIterator1, RandomAccessIterator2>
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
RandomAccessIterator2 begin2, Pred pred, Selector selector,
......@@ -182,39 +208,46 @@ namespace __gnu_parallel
difference_type length = end1 - begin1;
difference_type sequential_search_size = std::min<difference_type>(length, Settings::find_sequential_search_size);
difference_type sequential_search_size = std::min<difference_type>(
length, Settings::find_sequential_search_size);
// Try it sequentially first.
std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
selector.sequential_algorithm(begin1, begin1 + sequential_search_size, begin2, pred);
selector.sequential_algorithm(
begin1, begin1 + sequential_search_size, begin2, pred);
if (find_seq_result.first != (begin1 + sequential_search_size))
return find_seq_result;
// Index of beginning of next free block (after sequential find).
difference_type next_block_pos = sequential_search_size;
difference_type next_block_start = sequential_search_size;
difference_type result = length;
const thread_index_t num_threads = get_max_threads();
omp_lock_t result_lock;
omp_init_lock(&result_lock);
#pragma omp parallel shared(result) num_threads(num_threads)
thread_index_t num_threads = get_max_threads();
# pragma omp parallel shared(result) num_threads(num_threads)
{
# pragma omp single
num_threads = omp_get_num_threads();
// Not within first k elements -> start parallel.
thread_index_t iam = omp_get_thread_num();
difference_type block_size = Settings::find_initial_block_size;
difference_type start = fetch_and_add<difference_type>(&next_block_pos, block_size);
difference_type start =
fetch_and_add<difference_type>(&next_block_start, block_size);
// Get new block, update pointer to next block.
difference_type stop = std::min<difference_type>(length, start + block_size);
difference_type stop =
std::min<difference_type>(length, start + block_size);
std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
while (start < length)
{
#pragma omp flush(result)
# pragma omp flush(result)
// Get new value of result.
if (result < start)
{
......@@ -222,7 +255,8 @@ namespace __gnu_parallel
break;
}
local_result = selector.sequential_algorithm(begin1 + start, begin1 + stop, begin2 + start, pred);
local_result = selector.sequential_algorithm(
begin1 + start, begin1 + stop, begin2 + start, pred);
if (local_result.first != (begin1 + stop))
{
omp_set_lock(&result_lock);
......@@ -231,30 +265,35 @@ namespace __gnu_parallel
result = local_result.first - begin1;
// Result cannot be in future blocks, stop algorithm.
fetch_and_add<difference_type>(&next_block_pos, length);
fetch_and_add<difference_type>(&next_block_start, length);
}
omp_unset_lock(&result_lock);
}
block_size = std::min<difference_type>(block_size * Settings::find_increasing_factor, Settings::find_maximum_block_size);
block_size = std::min<difference_type>(
block_size * Settings::find_increasing_factor,
Settings::find_maximum_block_size);
// Get new block, update pointer to next block.
start = fetch_and_add<difference_type>(&next_block_pos, block_size);
stop = (length < (start + block_size)) ? length : (start + block_size);
}
start =
fetch_and_add<difference_type>(&next_block_start, block_size);
stop = (length < (start + block_size)) ?
length : (start + block_size);
}
} //parallel
omp_destroy_lock(&result_lock);
// Return iterator on found element.
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
begin1 + result, begin2 + result);
}
#endif
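The growing-blocks strategy above amortizes the synchronization cost of fetch_and_add: blocks handed out near the front of the sequence are small, so little work is wasted if the match is early, and each thread grows its block geometrically up to a cap as it moves on. An illustrative schedule follows; the three constants are made-up stand-ins for Settings::find_initial_block_size, find_increasing_factor and find_maximum_block_size, not their real defaults.

#include <algorithm>
#include <cstdio>

int main()
{
  long block = 256;                       // assumed initial block size
  const long factor = 4, maximum = 4096;  // assumed growth factor and cap
  for (int i = 0; i < 5; ++i)
    {
      std::printf("fetch %d: %ld elements\n", i, block);
      block = std::min(block * factor, maximum);  // 256, 1024, 4096, 4096, 4096
    }
  return 0;
}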
#if _GLIBCXX_FIND_CONSTANT_SIZE_BLOCKS
/**
/**
* @brief Parallel std::find, constant block size variant.
* @param begin1 Begin iterator of first sequence.
* @param end1 End iterator of first sequence.
......@@ -272,7 +311,11 @@ namespace __gnu_parallel
* blocks are allocated in a predetermined manner, namely spatial
* round-robin.
*/
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Pred,
typename Selector>
std::pair<RandomAccessIterator1, RandomAccessIterator2>
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
RandomAccessIterator2 begin2, Pred pred, Selector selector,
......@@ -285,47 +328,52 @@ namespace __gnu_parallel
difference_type length = end1 - begin1;
difference_type sequential_search_size = std::min<difference_type>(length, Settings::find_sequential_search_size);
difference_type sequential_search_size = std::min<difference_type>(
length, Settings::find_sequential_search_size);
// Try it sequentially first.
std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
selector.sequential_algorithm(begin1, begin1 + sequential_search_size, begin2, pred);
selector.sequential_algorithm(begin1, begin1 + sequential_search_size,
begin2, pred);
if (find_seq_result.first != (begin1 + sequential_search_size))
return find_seq_result;
difference_type result = length;
const thread_index_t num_threads = get_max_threads();
omp_lock_t result_lock;
omp_init_lock(&result_lock);
// Not within first sequential_search_size elements -> start parallel.
#pragma omp parallel shared(result) num_threads(num_threads)
thread_index_t num_threads = get_max_threads();
# pragma omp parallel shared(result) num_threads(num_threads)
{
# pragma omp single
num_threads = omp_get_num_threads();
thread_index_t iam = omp_get_thread_num();
difference_type block_size = Settings::find_initial_block_size;
difference_type start, stop;
// First element of thread's current iteration.
difference_type iteration_start = sequential_search_size;
// Where to work (initialization).
start = iteration_start + iam * block_size;
stop = std::min<difference_type>(length, start + block_size);
difference_type start = iteration_start + iam * block_size;
difference_type stop =
std::min<difference_type>(length, start + block_size);
std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
while (start < length)
{
// Get new value of result.
#pragma omp flush(result)
# pragma omp flush(result)
// No chance to find first element.
if (result < start)
break;
local_result = selector.sequential_algorithm(begin1 + start, begin1 + stop, begin2 + start, pred);
local_result = selector.sequential_algorithm(
begin1 + start, begin1 + stop,
begin2 + start, pred);
if (local_result.first != (begin1 + stop))
{
omp_set_lock(&result_lock);
......@@ -342,15 +390,15 @@ namespace __gnu_parallel
start = iteration_start + iam * block_size;
stop = std::min<difference_type>(length, start + block_size);
}
}
} //parallel
omp_destroy_lock(&result_lock);
// Return iterator on found element.
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
begin1 + result, begin2 + result);
}
#endif
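In the constant-size variant above no shared counter is needed at all: thread iam always takes block number iam of each round, so the assignment is a fixed spatial round-robin and only the shared result requires synchronization. An illustrative layout with made-up numbers:

#include <cstdio>

int main()
{
  const long block_size = 100;       // assumed block size
  const int num_threads = 4, iam = 2;
  long iteration_start = 0;          // would start after the sequential prefix
  for (int round = 0; round < 3; ++round)
    {
      long start = iteration_start + iam * block_size;
      // Thread 2 scans [200,300), [600,700), [1000,1100), ...
      std::printf("thread %d, round %d: [%ld, %ld)\n",
                  iam, round, start, start + block_size);
      iteration_start += num_threads * block_size;
    }
  return 0;
}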
} // end namespace
#endif
......@@ -29,9 +29,9 @@
// Public License.
/** @file parallel/losertree.h
* @brief Many generic loser tree variants.
* This file is a GNU parallel extension to the Standard C++ Library.
*/
* @brief Many generic loser tree variants.
* This file is a GNU parallel extension to the Standard C++ Library.
*/
// Written by Johannes Singler.
......@@ -49,13 +49,13 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_EXPLICIT
/** @brief Guarded loser tree, copying the whole element into the
* tree structure.
*
* Guarding is done explicitly through two flags per element, inf
* and sup. This is a quite slow variant.
*/
template<typename T, typename Comparator = std::less<T> >
/** @brief Guarded loser tree, copying the whole element into the
* tree structure.
*
* Guarding is done explicitly through two flags per element, inf
* and sup. This is a quite slow variant.
*/
template<typename T, typename Comparator = std::less<T> >
class LoserTreeExplicit
{
private:
......@@ -76,7 +76,9 @@ namespace __gnu_parallel
Comparator comp;
public:
inline LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>()) : comp(_comp)
inline
LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>())
: comp(_comp)
{
size = _size;
offset = size;
......@@ -93,9 +95,6 @@ namespace __gnu_parallel
inline ~LoserTreeExplicit()
{ delete[] losers; }
inline void
print() { }
inline int
get_min_source()
{ return losers[0].source; }
......@@ -106,7 +105,8 @@ namespace __gnu_parallel
bool inf = false;
for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
{
if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup && comp(losers[pos].key, key)) || losers[pos].inf || sup)
if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
&& comp(losers[pos].key, key)) || losers[pos].inf || sup)
{
// The other one is smaller.
std::swap(losers[pos].key, key);
......@@ -209,14 +209,14 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE
/** @brief Guarded loser tree, either copying the whole element into
* the tree structure, or looking up the element via the index.
*
* Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine. This
* is a well-performing variant.
*/
template<typename T, typename Comparator = std::less<T> >
/** @brief Guarded loser tree, either copying the whole element into
* the tree structure, or looking up the element via the index.
*
* Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine. This
* is a well-performing variant.
*/
template<typename T, typename Comparator = std::less<T> >
class LoserTree
{
private:
......@@ -240,7 +240,7 @@ namespace __gnu_parallel
// Next greater power of 2.
k = 1 << (log2(ik - 1) + 1);
offset = k;
losers = new Loser[k * 2];
losers = static_cast<Loser*>(::operator new(k * 2 * sizeof(Loser)));
for (unsigned int i = ik - 1; i < k; i++)
losers[i + k].sup = true;
}
......@@ -248,14 +248,6 @@ namespace __gnu_parallel
inline ~LoserTree()
{ delete[] losers; }
void
print()
{
for (unsigned int i = 0; i < (k * 2); i++)
printf("%d %d from %d, %d\n", i, losers[i].key,
losers[i].source, losers[i].sup);
}
inline int
get_min_source()
{ return losers[0].source; }
......@@ -267,7 +259,7 @@ namespace __gnu_parallel
losers[pos].sup = sup;
losers[pos].source = source;
losers[pos].key = key;
new(&(losers[pos].key)) T(key);
}
unsigned int
......@@ -282,7 +274,8 @@ namespace __gnu_parallel
unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1);
if (losers[right].sup ||
(!losers[left].sup && !comp(losers[right].key, losers[left].key)))
(!losers[left].sup
&& !comp(losers[right].key, losers[left].key)))
{
// Left one is less or equal.
losers[root] = losers[right];
......@@ -337,8 +330,9 @@ namespace __gnu_parallel
{
unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1);
if ( losers[right].sup ||
(!losers[left].sup && !comp(losers[right].key, losers[left].key)))
if (losers[right].sup
|| (!losers[left].sup
&& !comp(losers[right].key, losers[left].key)))
{
// Left one is less or equal.
losers[root] = losers[right];
......@@ -365,10 +359,11 @@ namespace __gnu_parallel
for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
{
// The smaller one gets promoted, ties are broken by source.
if ( (sup && (!losers[pos].sup || losers[pos].source < source)) ||
(!sup && !losers[pos].sup &&
((comp(losers[pos].key, key)) ||
(!comp(key, losers[pos].key) && losers[pos].source < source))))
if ( (sup && (!losers[pos].sup || losers[pos].source < source))
|| (!sup && !losers[pos].sup
&& ((comp(losers[pos].key, key))
|| (!comp(key, losers[pos].key)
&& losers[pos].source < source))))
{
// The other one is smaller.
std::swap(losers[pos].sup, sup);
......@@ -387,14 +382,14 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_REFERENCE
/** @brief Guarded loser tree, either copying the whole element into
* the tree structure, or looking up the element via the index.
*
* Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine. This
* is a well-performing variant.
*/
template<typename T, typename Comparator = std::less<T> >
/** @brief Guarded loser tree, either copying the whole element into
* the tree structure, or looking up the element via the index.
*
* Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine. This
* is a well-performing variant.
*/
template<typename T, typename Comparator = std::less<T> >
class LoserTreeReference
{
#undef COPY
......@@ -423,7 +418,9 @@ namespace __gnu_parallel
Comparator comp;
public:
inline LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
inline
LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>())
: comp(_comp)
{
ik = _k;
......@@ -446,13 +443,6 @@ namespace __gnu_parallel
#endif
}
void
print()
{
for (unsigned int i = 0; i < (k * 2); i++)
printf("%d %d from %d, %d\n", i, KEY(i), losers[i].source, losers[i].sup);
}
inline int
get_min_source()
{ return losers[0].source; }
......@@ -570,7 +560,8 @@ namespace __gnu_parallel
if ( (sup && (!losers[pos].sup || losers[pos].source < source)) ||
(!sup && !losers[pos].sup &&
((comp(KEY(pos), KEY_SOURCE(source))) ||
(!comp(KEY_SOURCE(source), KEY(pos)) && losers[pos].source < source))))
(!comp(KEY_SOURCE(source), KEY(pos))
&& losers[pos].source < source))))
{
// The other one is smaller.
std::swap(losers[pos].sup, sup);
......@@ -595,13 +586,13 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_POINTER
/** @brief Guarded loser tree, either copying the whole element into
/** @brief Guarded loser tree, either copying the whole element into
the tree structure, or looking up the element via the index.
* Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine.
* This is a well-performing variant.
*/
template<typename T, typename Comparator = std::less<T> >
* Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine.
* This is a well-performing variant.
*/
template<typename T, typename Comparator = std::less<T> >
class LoserTreePointer
{
private:
......@@ -617,7 +608,9 @@ namespace __gnu_parallel
Comparator comp;
public:
inline LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
inline
LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>())
: comp(_comp)
{
ik = _k;
......@@ -632,13 +625,6 @@ namespace __gnu_parallel
inline ~LoserTreePointer()
{ delete[] losers; }
void
print()
{
for (unsigned int i = 0; i < (k * 2); i++)
printf("%d %d from %d, %d\n", i, losers[i].keyp, losers[i].source, losers[i].sup);
}
inline int
get_min_source()
{ return losers[0].source; }
......@@ -664,8 +650,9 @@ namespace __gnu_parallel
{
unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1);
if ( losers[right].sup ||
(!losers[left].sup && !comp(*losers[right].keyp, *losers[left].keyp)))
if (losers[right].sup
|| (!losers[left].sup
&& !comp(*losers[right].keyp, *losers[left].keyp)))
{
// Left one is less or equal.
losers[root] = losers[right];
......@@ -773,13 +760,13 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_UNGUARDED
/** @brief Unguarded loser tree, copying the whole element into the
* tree structure.
*
* No guarding is done, therefore none of the input sequences may
* run empty. This is a very fast variant.
*/
template<typename T, typename Comparator = std::less<T> >
/** @brief Unguarded loser tree, copying the whole element into the
* tree structure.
*
* No guarding is done, therefore none of the input sequences may
* run empty. This is a very fast variant.
*/
template<typename T, typename Comparator = std::less<T> >
class LoserTreeUnguarded
{
private:
......@@ -809,7 +796,9 @@ namespace __gnu_parallel
}
public:
inline LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
inline
LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>())
: comp(_comp)
{
ik = _k;
// Next greater or equal power of 2.
......@@ -826,13 +815,6 @@ namespace __gnu_parallel
delete[] mapping;
}
void
print()
{
for (unsigned int i = 0; i < k + ik; i++)
printf("%d %d from %d\n", i, losers[i].key, losers[i].source);
}
inline int
get_min_source()
{ return losers[0].source; }
......@@ -855,7 +837,8 @@ namespace __gnu_parallel
// Next greater or equal power of 2.
unsigned int division = 1 << (log2(end - begin - 1));
unsigned int left = init_winner(2 * root, begin, begin + division);
unsigned int right = init_winner(2 * root + 1, begin + division, end);
unsigned int right =
init_winner(2 * root + 1, begin + division, end);
if (!comp(losers[right].key, losers[left].key))
{
// Left one is less or equal.
......@@ -912,7 +895,8 @@ namespace __gnu_parallel
{
// The smaller one gets promoted, ties are broken by source.
if (comp(losers[pos].key, keyr)
|| (!comp(keyr, losers[pos].key) && losers[pos].source < source))
|| (!comp(keyr, losers[pos].key)
&& losers[pos].source < source))
{
// The other one is smaller.
std::swap(losers[pos].source, source);
......@@ -926,13 +910,13 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED
/** @brief Unguarded loser tree, keeping only pointers to the
* elements in the tree structure.
*
* No guarding is done, therefore none of the input sequences may
* run empty. This is a very fast variant.
*/
template<typename T, typename Comparator = std::less<T> >
/** @brief Unguarded loser tree, keeping only pointers to the
* elements in the tree structure.
*
* No guarding is done, therefore none of the input sequences may
* run empty. This is a very fast variant.
*/
template<typename T, typename Comparator = std::less<T> >
class LoserTreePointerUnguarded
{
private:
......@@ -961,7 +945,10 @@ namespace __gnu_parallel
}
public:
inline LoserTreePointerUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
inline
LoserTreePointerUnguarded(unsigned int _k,
Comparator _comp = std::less<T>())
: comp(_comp)
{
ik = _k;
......@@ -979,13 +966,6 @@ namespace __gnu_parallel
delete[] mapping;
}
void
print()
{
for (unsigned int i = 0; i < k + ik; i++)
printf("%d %d from %d\n", i, *losers[i].keyp, losers[i].source);
}
inline int
get_min_source()
{ return losers[0].source; }
......@@ -1008,7 +988,8 @@ namespace __gnu_parallel
// Next greater or equal power of 2.
unsigned int division = 1 << (log2(end - begin - 1));
unsigned int left = init_winner(2 * root, begin, begin + division);
unsigned int right = init_winner(2 * root + 1, begin + division, end);
unsigned int right
= init_winner(2 * root + 1, begin + division, end);
if (!comp(*losers[right].keyp, *losers[left].keyp))
{
// Left one is less or equal.
......@@ -1079,7 +1060,7 @@ namespace __gnu_parallel
};
#endif
template<typename _ValueTp, class Comparator>
template<typename _ValueTp, class Comparator>
struct loser_tree_traits
{
#if _GLIBCXX_LOSER_TREE
......@@ -1093,7 +1074,7 @@ namespace __gnu_parallel
#endif
};
template<typename _ValueTp, class Comparator>
template<typename _ValueTp, class Comparator>
struct loser_tree_unguarded_traits
{
#if _GLIBCXX_LOSER_TREE_UNGUARDED
......
......@@ -29,16 +29,16 @@
// Public License.
/** @file parallel/multiway_merge.h
* @brief Implementation of sequential and parallel multiway merge.
*
* Explanations on the high-speed merging routines in the appendix of
*
* P. Sanders.
* Fast priority queues for cached memory.
* ACM Journal of Experimental Algorithmics, 5, 2000.
*
* This file is a GNU parallel extension to the Standard C++ Library.
*/
* @brief Implementation of sequential and parallel multiway merge.
*
* Explanations on the high-speed merging routines in the appendix of
*
* P. Sanders.
* Fast priority queues for cached memory.
* ACM Journal of Experimental Algorithmics, 5, 2000.
*
* This file is a GNU parallel extension to the Standard C++ Library.
*/
// Written by Johannes Singler.
......@@ -62,15 +62,15 @@
// XXX need iterator typedefs
namespace __gnu_parallel
{
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
class guarded_iterator;
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2);
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2);
......@@ -80,7 +80,7 @@ namespace __gnu_parallel
* Deriving from RandomAccessIterator is not possible since
* RandomAccessIterator need not be a class.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
class guarded_iterator
{
private:
......@@ -125,17 +125,21 @@ namespace __gnu_parallel
{ return current; }
friend bool
operator< <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2);
operator< <RandomAccessIterator, Comparator>(
guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2);
friend bool
operator<= <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2);
operator<= <RandomAccessIterator, Comparator>(
guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2);
};
/** @brief Compare two elements referenced by guarded iterators.
/** @brief Compare two elements referenced by guarded iterators.
* @param bi1 First iterator.
* @param bi2 Second iterator.
* @return @c True if less. */
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2)
......@@ -147,11 +151,11 @@ namespace __gnu_parallel
return (bi1.comp)(*bi1, *bi2); //normal compare
}
/** @brief Compare two elements referenced by guarded iterators.
/** @brief Compare two elements referenced by guarded iterators.
* @param bi1 First iterator.
* @param bi2 Second iterator.
* @return @c True if less equal. */
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2)
......@@ -163,20 +167,20 @@ namespace __gnu_parallel
return !(bi1.comp)(*bi2, *bi1); //normal compare
}
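The guarded comparisons above make an exhausted sequence behave like an element larger than everything else, so it can never be selected by the merge. A conceptual sketch with plain pointers standing in for the guarded iterators (not the library's class):

#include <cassert>

struct guarded_demo
{
  const int* current;
  const int* end;
  bool exhausted() const { return current == end; }
};

// "bi1 < bi2" in the sense used above: an empty sequence acts as +infinity.
inline bool
guarded_less(const guarded_demo& a, const guarded_demo& b)
{
  if (a.exhausted())
    return b.exhausted();         // a is empty: it only compares less if b is empty too
  if (b.exhausted())
    return true;                  // a still has elements, so it wins
  return *a.current < *b.current; // normal compare
}

int main()
{
  int s1[] = { 1, 5 }, s2[] = { 3 };
  guarded_demo a = { s1, s1 + 2 };        // still has elements
  guarded_demo b = { s2 + 1, s2 + 1 };    // exhausted
  assert(guarded_less(a, b) && !guarded_less(b, a));
  return 0;
}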
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
class unguarded_iterator;
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
class unguarded_iterator
{
private:
......@@ -217,17 +221,21 @@ namespace __gnu_parallel
{ return current; }
friend bool
operator< <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
operator< <RandomAccessIterator, Comparator>(
unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
friend bool
operator<= <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
operator<= <RandomAccessIterator, Comparator>(
unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
};
/** @brief Compare two elements referenced by unguarded iterators.
/** @brief Compare two elements referenced by unguarded iterators.
* @param bi1 First iterator.
* @param bi2 Second iterator.
* @return @c True if less. */
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
......@@ -236,11 +244,11 @@ namespace __gnu_parallel
return (bi1.comp)(*bi1, *bi2);
}
/** @brief Compare two elements referenced by unguarded iterators.
/** @brief Compare two elements referenced by unguarded iterators.
* @param bi1 First iterator.
* @param bi2 Second iterator.
* @return @c True if less equal. */
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
......@@ -249,26 +257,30 @@ namespace __gnu_parallel
return !(bi1.comp)(*bi2, *bi1);
}
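The distinction between the two iterator flavours boils down to one extra check per comparison: a guarded head comparison must treat an exhausted sequence as larger than any live one, so that it always loses, while the unguarded variant may drop that check because the caller guarantees that no sequence runs dry. A rough standalone illustration (guarded_head_less and unguarded_head_less are hypothetical names, not the classes above):
#include <utility>
// A "head" is a (current, end) pair into one sorted input sequence.
template<typename It, typename Comp>
bool guarded_head_less(const std::pair<It, It>& a,
                       const std::pair<It, It>& b, Comp comp)
{
  if (a.first == a.second)          // a exhausted: never the minimum
    return false;
  if (b.first == b.second)          // b exhausted: a wins automatically
    return true;
  return comp(*a.first, *b.first);  // both live: normal compare
}
template<typename It, typename Comp>
bool unguarded_head_less(const std::pair<It, It>& a,
                         const std::pair<It, It>& b, Comp comp)
{
  // Precondition: neither sequence is exhausted, so no end checks needed.
  return comp(*a.first, *b.first);
}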
/** Prepare a set of sequences to be merged without an (end) guard
/** Prepare a set of sequences to be merged without an (end) guard
* @param seqs_begin
* @param seqs_end
* @param comp
* @param min_sequence
* @param stable
* @pre (seqs_end - seqs_begin > 0) */
template<typename RandomAccessIteratorIterator, typename Comparator>
typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type
template<typename RandomAccessIteratorIterator, typename Comparator>
typename std::iterator_traits<
typename std::iterator_traits<RandomAccessIteratorIterator>::value_type
::first_type>::difference_type
prepare_unguarded(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end, Comparator comp,
int& min_sequence, bool stable)
{
_GLIBCXX_CALL(seqs_end - seqs_begin)
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type
typedef typename std::iterator_traits<RandomAccessIterator1>
::difference_type
difference_type;
if ((*seqs_begin).first == (*seqs_begin).second)
......@@ -317,7 +329,8 @@ namespace __gnu_parallel
for (; s < (seqs_end - seqs_begin); s++)
{
RandomAccessIterator1 split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second, min, comp);
RandomAccessIterator1 split = std::lower_bound(
seqs_begin[s].first, seqs_begin[s].second, min, comp);
overhang_size += seqs_begin[s].second - split;
}
......@@ -325,23 +338,27 @@ namespace __gnu_parallel
return overhang_size;
}
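Put differently, prepare_unguarded locates the sequence whose last element is smallest and counts, per sequence, how many elements are not less than that value; only that overhang still needs a guarded merge. A simplified standalone sketch of the same computation (overhang_size is a hypothetical helper working on plain pointer ranges and assuming every input is non-empty):
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>
// Returns the overhang and the index of the sequence whose last element
// is smallest (simplified; not the library function above).
template<typename T, typename Comp>
std::ptrdiff_t
overhang_size(const std::vector<std::pair<const T*, const T*> >& seqs,
              Comp comp, int& min_sequence)
{
  min_sequence = 0;
  for (std::size_t s = 1; s < seqs.size(); ++s)
    if (comp(*(seqs[s].second - 1), *(seqs[min_sequence].second - 1)))
      min_sequence = static_cast<int>(s);
  const T& min_last = *(seqs[min_sequence].second - 1);
  std::ptrdiff_t overhang = 0;
  for (std::size_t s = 0; s < seqs.size(); ++s)
    {
      const T* split = std::lower_bound(seqs[s].first, seqs[s].second,
                                        min_last, comp);
      overhang += seqs[s].second - split;   // elements >= min_last
    }
  return overhang;
}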
/** Prepare a set of sequences to be merged with an (end) guard (sentinel)
/** Prepare a set of sequences to be merged with an (end) guard (sentinel)
* @param seqs_begin
* @param seqs_end
* @param comp */
template<typename RandomAccessIteratorIterator, typename Comparator>
typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type
template<typename RandomAccessIteratorIterator, typename Comparator>
typename std::iterator_traits<typename std::iterator_traits<
RandomAccessIteratorIterator>::value_type::first_type>::difference_type
prepare_unguarded_sentinel(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
Comparator comp)
{
_GLIBCXX_CALL(seqs_end - seqs_begin)
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
typedef typename std::iterator_traits<RandomAccessIterator1>
::value_type
value_type;
typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type
typedef typename std::iterator_traits<RandomAccessIterator1>
::difference_type
difference_type;
// Last element in sequence.
......@@ -362,8 +379,8 @@ namespace __gnu_parallel
difference_type overhang_size = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
{
RandomAccessIterator1 split = std::lower_bound((*s).first, (*s).second,
*max, comp);
RandomAccessIterator1 split =
std::lower_bound((*s).first, (*s).second, *max, comp);
overhang_size += (*s).second - split;
// Set sentinel.
......@@ -374,7 +391,7 @@ namespace __gnu_parallel
return overhang_size;
}
/** @brief Highly efficient 3-way merging procedure.
/** @brief Highly efficient 3-way merging procedure.
* @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence.
......@@ -382,15 +399,25 @@ namespace __gnu_parallel
* @param length Maximum length to merge.
* @param stable Unused, stable anyway.
* @return End iterator of output sequence. */
template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
template<typename RAI, typename C> class iterator,
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_3_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_3_variant(
RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp, _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -456,14 +483,23 @@ namespace __gnu_parallel
return target;
}
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -472,7 +508,8 @@ namespace __gnu_parallel
RandomAccessIterator3 target_end;
// Stable anyway.
difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
difference_type overhang =
prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
......@@ -480,7 +517,8 @@ namespace __gnu_parallel
if (overhang != -1)
{
difference_type unguarded_length = std::min(length, total_length - overhang);
difference_type unguarded_length =
std::min(length, total_length - overhang);
target_end = multiway_merge_3_variant<unguarded_iterator>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable);
overhang = length - unguarded_length;
......@@ -527,7 +565,7 @@ namespace __gnu_parallel
return target_end;
}
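The length split performed above is plain arithmetic: merge min(length, total_length - overhang) elements without guards, then let the guarded pass finish the rest. A small worked example with made-up numbers:
#include <algorithm>
#include <cassert>
int main()
{
  typedef long difference_type;
  const difference_type total_length = 10 + 12 + 8; // three inputs
  const difference_type overhang = 5;               // from prepare_unguarded
  const difference_type length = 30;                // requested output length
  difference_type unguarded_length =
    std::min(length, total_length - overhang);      // 25 elements, no guards
  difference_type guarded_rest = length - unguarded_length; // 5 elements
  assert(unguarded_length == 25 && guarded_rest == 5);
  return 0;
}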
/** @brief Highly efficient 4-way merging procedure.
/** @brief Highly efficient 4-way merging procedure.
* @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence.
......@@ -535,14 +573,23 @@ namespace __gnu_parallel
* @param length Maximum length to merge.
* @param stable Unused, stable anyway.
* @return End iterator of output sequence. */
template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
template<typename RAI, typename C> class iterator,
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp, _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -633,14 +680,23 @@ namespace __gnu_parallel
return target;
}
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -649,7 +705,8 @@ namespace __gnu_parallel
RandomAccessIterator3 target_end;
// Stable anyway.
difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
difference_type overhang =
prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
......@@ -657,7 +714,8 @@ namespace __gnu_parallel
if (overhang != -1)
{
difference_type unguarded_length = std::min(length, total_length - overhang);
difference_type unguarded_length =
std::min(length, total_length - overhang);
target_end = multiway_merge_4_variant<unguarded_iterator>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable);
overhang = length - unguarded_length;
......@@ -674,10 +732,13 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
#endif
std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > one_missing(seqs_begin, seqs_end);
std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> >
one_missing(seqs_begin, seqs_end);
one_missing.erase(one_missing.begin() + min_seq); //remove
target_end = multiway_merge_3_variant<guarded_iterator>(one_missing.begin(), one_missing.end(), target_end, comp, overhang, stable);
target_end = multiway_merge_3_variant<guarded_iterator>(
one_missing.begin(), one_missing.end(),
target_end, comp, overhang, stable);
// Insert back again.
one_missing.insert(one_missing.begin() + min_seq, seqs_begin[min_seq]);
......@@ -692,7 +753,7 @@ namespace __gnu_parallel
return target_end;
}
/** @brief Basic multi-way merging procedure.
/** @brief Basic multi-way merging procedure.
*
* The head elements are kept in a sorted array; new heads are
* inserted linearly.
......@@ -704,14 +765,22 @@ namespace __gnu_parallel
* @param stable Stable merging incurs a performance penalty.
* @return End iterator of output sequence.
*/
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp, _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -719,7 +788,8 @@ namespace __gnu_parallel
// Num remaining pieces.
int k = static_cast<int>(seqs_end - seqs_begin), nrp;
value_type* pl = static_cast<value_type*>(::operator new(sizeof(value_type) * k));
value_type* pl = static_cast<value_type*>(
::operator new(sizeof(value_type) * k));
int* source = new int[k];
difference_type total_length = 0;
......@@ -818,7 +888,8 @@ namespace __gnu_parallel
// Sink down.
j = 1;
while ((j < nrp) && (comp(pl[j], pl[j - 1]) ||
(!comp(pl[j - 1], pl[j]) && (source[j] < source[j - 1]))))
(!comp(pl[j - 1], pl[j])
&& (source[j] < source[j - 1]))))
{
std::swap(pl[j - 1], pl[j]);
std::swap(source[j - 1], source[j]);
......@@ -869,7 +940,7 @@ namespace __gnu_parallel
return target;
}
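Stripped of the stability bookkeeping and the raw-memory handling, the bubble variant is a k-way merge whose current heads live in a small array kept sorted by linear insertion. A hypothetical condensed version (merge_by_sorted_heads; assumes non-empty sorted inputs):
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>
template<typename T, typename Comp>
std::vector<T>
merge_by_sorted_heads(std::vector<std::pair<const T*, const T*> > seqs,
                      Comp comp)
{
  std::vector<T> out;
  // Establish the invariant: heads sorted ascending (insertion sort).
  for (std::size_t i = 1; i < seqs.size(); ++i)
    for (std::size_t j = i;
         j > 0 && comp(*seqs[j].first, *seqs[j - 1].first); --j)
      std::swap(seqs[j], seqs[j - 1]);
  while (!seqs.empty())
    {
      out.push_back(*seqs[0].first++);        // emit the smallest head
      if (seqs[0].first == seqs[0].second)
        {
          seqs.erase(seqs.begin());           // this input is exhausted
          continue;
        }
      // Sink the refilled head back into place (linear insertion).
      std::size_t j = 0;
      while (j + 1 < seqs.size() && comp(*seqs[j + 1].first, *seqs[j].first))
        {
          std::swap(seqs[j], seqs[j + 1]);
          ++j;
        }
    }
  return out;
}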
/** @brief Multi-way merging procedure for a high branching factor,
/** @brief Multi-way merging procedure for a high branching factor,
* guarded case.
*
* The head elements are kept in a loser tree.
......@@ -881,14 +952,24 @@ namespace __gnu_parallel
* @param stable Stable merging incurs a performance penalty.
* @return End iterator of output sequence.
*/
template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename LT,
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -978,7 +1059,7 @@ namespace __gnu_parallel
return target;
}
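Readers who do not want to trace the LoserTree plumbing (see losertree.h) can think of the guarded high-k merge as a heap-based k-way merge; the loser tree merely replaces the heap to save comparisons per extracted element. A rough non-stable equivalent built on std::priority_queue (kway_merge_with_heap and head_greater are illustrative names, not the library's):
#include <cstddef>
#include <queue>
#include <utility>
#include <vector>
// Order (current, end) ranges by their head element, smallest on top.
template<typename T, typename Comp>
struct head_greater
{
  Comp comp;
  explicit head_greater(Comp c) : comp(c) { }
  bool operator()(const std::pair<const T*, const T*>& a,
                  const std::pair<const T*, const T*>& b) const
  { return comp(*b.first, *a.first); }       // inverted to get a min-heap
};
template<typename T, typename Comp>
std::vector<T>
kway_merge_with_heap(const std::vector<std::pair<const T*, const T*> >& seqs,
                     Comp comp)
{
  typedef std::pair<const T*, const T*> range;
  head_greater<T, Comp> hg(comp);
  std::priority_queue<range, std::vector<range>, head_greater<T, Comp> >
    heap(hg);
  for (std::size_t s = 0; s < seqs.size(); ++s)
    if (seqs[s].first != seqs[s].second)
      heap.push(seqs[s]);                    // guarded: skip empty inputs
  std::vector<T> out;
  while (!heap.empty())
    {
      range r = heap.top();
      heap.pop();
      out.push_back(*r.first++);             // emit the global minimum
      if (r.first != r.second)
        heap.push(r);                        // feed from the same sequence
    }
  return out;
}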
/** @brief Multi-way merging procedure for a high branching factor,
/** @brief Multi-way merging procedure for a high branching factor,
* unguarded case.
*
* The head elements are kept in a loser tree.
......@@ -991,14 +1072,23 @@ namespace __gnu_parallel
* @return End iterator of output sequence.
* @pre No input will run out of elements during the merge.
*/
template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename LT,
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp, typename Comparator>
RandomAccessIterator3
multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -1045,13 +1135,16 @@ namespace __gnu_parallel
source = lt.get_min_source();
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1)));
_GLIBCXX_PARALLEL_ASSERT(i == 0
|| !comp(*(seqs_begin[source].first), *(target - 1)));
#endif
*(target++) = *(seqs_begin[source].first++);
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i == length - 1));
_GLIBCXX_PARALLEL_ASSERT(
(seqs_begin[source].first != seqs_begin[source].second)
|| (i == length - 1));
i++;
#endif
// Feed.
......@@ -1071,15 +1164,19 @@ namespace __gnu_parallel
#if _GLIBCXX_ASSERTIONS
if (i > 0 && comp(*(seqs_begin[source].first), *(target - 1)))
printf(" %i %i %i\n", length, i, source);
_GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1)));
_GLIBCXX_PARALLEL_ASSERT(i == 0
|| !comp(*(seqs_begin[source].first), *(target - 1)));
#endif
*(target++) = *(seqs_begin[source].first++);
#if _GLIBCXX_ASSERTIONS
if (!((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1)))
if (!((seqs_begin[source].first != seqs_begin[source].second)
|| (i >= length - 1)))
printf(" %i %i %i\n", length, i, source);
_GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1));
_GLIBCXX_PARALLEL_ASSERT(
(seqs_begin[source].first != seqs_begin[source].second)
|| (i >= length - 1));
i++;
#endif
// Feed.
......@@ -1091,15 +1188,24 @@ namespace __gnu_parallel
return target;
}
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -1115,7 +1221,8 @@ namespace __gnu_parallel
if (overhang != -1)
{
difference_type unguarded_length = std::min(length, total_length - overhang);
difference_type unguarded_length =
std::min(length, total_length - overhang);
target_end = multiway_merge_loser_tree_unguarded
<typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable);
......@@ -1145,23 +1252,31 @@ namespace __gnu_parallel
return target_end;
}
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef std::iterator_traits<RandomAccessIteratorIterator> traits_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
RandomAccessIterator1;
RandomAccessIterator3 target_end;
difference_type overhang = prepare_unguarded_sentinel(seqs_begin, seqs_end, comp);
difference_type overhang =
prepare_unguarded_sentinel(seqs_begin, seqs_end, comp);
difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
......@@ -1172,7 +1287,8 @@ namespace __gnu_parallel
(*s).second++;
}
difference_type unguarded_length = std::min(length, total_length - overhang);
difference_type unguarded_length =
std::min(length, total_length - overhang);
target_end = multiway_merge_loser_tree_unguarded
<typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable);
......@@ -1184,12 +1300,15 @@ namespace __gnu_parallel
#endif
// Copy rest stable.
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end && overhang > 0; s++)
for (RandomAccessIteratorIterator s = seqs_begin;
s != seqs_end && overhang > 0; s++)
{
// Restore.
(*s).second--;
difference_type local_length = std::min((difference_type)overhang, (difference_type)LENGTH(*s));
target_end = std::copy((*s).first, (*s).first + local_length, target_end);
difference_type local_length =
std::min<difference_type>(overhang, LENGTH(*s));
target_end = std::copy((*s).first, (*s).first + local_length,
target_end);
(*s).first += local_length;
overhang -= local_length;
}
......@@ -1203,7 +1322,7 @@ namespace __gnu_parallel
return target_end;
}
/** @brief Sequential multi-way merging switch.
/** @brief Sequential multi-way merging switch.
*
* The decision is based on the branching factor and runtime settings.
* @param seqs_begin Begin iterator of iterator pair input sequence.
......@@ -1214,14 +1333,24 @@ namespace __gnu_parallel
* @param stable Stable merging incurs a performance penalty.
* @param sentinel The sequences have a sentinel element.
* @return End iterator of output sequence. */
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel, sequential_tag)
multiway_merge(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp, _DifferenceTp length,
bool stable, bool sentinel,
sequential_tag)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
......@@ -1234,7 +1363,8 @@ namespace __gnu_parallel
RandomAccessIterator3 return_target = target;
int k = static_cast<int>(seqs_end - seqs_begin);
Settings::MultiwayMergeAlgorithm mwma = Settings::multiway_merge_algorithm;
Settings::MultiwayMergeAlgorithm mwma =
Settings::multiway_merge_algorithm;
if (!sentinel && mwma == Settings::LOSER_TREE_SENTINEL)
mwma = Settings::LOSER_TREE_COMBINED;
......@@ -1244,23 +1374,40 @@ namespace __gnu_parallel
case 0:
break;
case 1:
return_target = std::copy(seqs_begin[0].first, seqs_begin[0].first + length, target);
return_target = std::copy(seqs_begin[0].first,
seqs_begin[0].first + length,
target);
seqs_begin[0].first += length;
break;
case 2:
return_target = merge_advance(seqs_begin[0].first, seqs_begin[0].second, seqs_begin[1].first, seqs_begin[1].second, target, length, comp);
return_target = merge_advance(seqs_begin[0].first,
seqs_begin[0].second,
seqs_begin[1].first,
seqs_begin[1].second,
target, length, comp);
break;
case 3:
switch (mwma)
{
case Settings::LOSER_TREE_COMBINED:
return_target = multiway_merge_3_combined(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_3_combined(seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
case Settings::LOSER_TREE_SENTINEL:
return_target = multiway_merge_3_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_3_variant<unguarded_iterator>(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
default:
return_target = multiway_merge_3_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_3_variant<guarded_iterator>(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
}
break;
......@@ -1268,13 +1415,25 @@ namespace __gnu_parallel
switch (mwma)
{
case Settings::LOSER_TREE_COMBINED:
return_target = multiway_merge_4_combined(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_4_combined(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
case Settings::LOSER_TREE_SENTINEL:
return_target = multiway_merge_4_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_4_variant<unguarded_iterator>(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
default:
return_target = multiway_merge_4_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_4_variant<guarded_iterator>(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
}
break;
......@@ -1283,26 +1442,48 @@ namespace __gnu_parallel
switch (mwma)
{
case Settings::BUBBLE:
return_target = multiway_merge_bubble(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_bubble(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#if _GLIBCXX_LOSER_TREE_EXPLICIT
case Settings::LOSER_TREE_EXPLICIT:
return_target = multiway_merge_loser_tree<LoserTreeExplicit<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_loser_tree<
LoserTreeExplicit<value_type, Comparator> >(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#endif
#if _GLIBCXX_LOSER_TREE
case Settings::LOSER_TREE:
return_target = multiway_merge_loser_tree<LoserTree<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_loser_tree<
LoserTree<value_type, Comparator> >(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#endif
#if _GLIBCXX_LOSER_TREE_COMBINED
case Settings::LOSER_TREE_COMBINED:
return_target = multiway_merge_loser_tree_combined(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_loser_tree_combined(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#endif
#if _GLIBCXX_LOSER_TREE_SENTINEL
case Settings::LOSER_TREE_SENTINEL:
return_target = multiway_merge_loser_tree_sentinel(seqs_begin, seqs_end, target, comp, length, stable);
return_target = multiway_merge_loser_tree_sentinel(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#endif
default:
......@@ -1319,7 +1500,7 @@ namespace __gnu_parallel
return return_target;
}
/** @brief Parallel multi-way merge routine.
/** @brief Parallel multi-way merge routine.
*
* The decision is based on the branching factor and runtime settings.
* @param seqs_begin Begin iterator of iterator pair input sequence.
......@@ -1331,28 +1512,33 @@ namespace __gnu_parallel
* @param sentinel Ignored.
* @return End iterator of output sequence.
*/
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel)
parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable, bool sentinel)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
#if _GLIBCXX_ASSERTIONS
for (RandomAccessIteratorIterator rii = seqs_begin; rii != seqs_end; rii++)
_GLIBCXX_PARALLEL_ASSERT(is_sorted((*rii).first, (*rii).second, comp));
#endif
// k sequences.
int k = static_cast<int>(seqs_end - seqs_begin);
difference_type total_length = 0;
for (RandomAccessIteratorIterator raii = seqs_begin; raii != seqs_end; raii++)
for (RandomAccessIteratorIterator raii = seqs_begin;
raii != seqs_end; raii++)
total_length += LENGTH(*raii);
_GLIBCXX_CALL(total_length)
......@@ -1360,32 +1546,50 @@ namespace __gnu_parallel
if (total_length == 0 || k == 0)
return target;
thread_index_t num_threads = static_cast<thread_index_t>(std::min(static_cast<difference_type>(get_max_threads()), total_length));
bool tight = (total_length == length);
std::vector<std::pair<difference_type, difference_type> >* pieces;
thread_index_t num_threads = static_cast<thread_index_t>(
std::min<difference_type>(get_max_threads(), total_length));
# pragma omp parallel num_threads (num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
// Thread t will have to merge pieces[iam][0..k - 1]
std::vector<std::pair<difference_type, difference_type> >* pieces = new std::vector<std::pair<difference_type, difference_type> >[num_threads];
pieces = new std::vector<
std::pair<difference_type, difference_type> >[num_threads];
for (int s = 0; s < num_threads; s++)
pieces[s].resize(k);
difference_type num_samples = Settings::merge_oversampling * num_threads;
difference_type num_samples =
Settings::merge_oversampling * num_threads;
if (Settings::multiway_merge_splitting == Settings::SAMPLING)
{
value_type* samples = static_cast<value_type*>(::operator new(sizeof(value_type) * k * num_samples));
value_type* samples = static_cast<value_type*>(
::operator new(sizeof(value_type) * k * num_samples));
// Sample.
for (int s = 0; s < k; s++)
for (int i = 0; (difference_type)i < num_samples; i++)
{
difference_type sample_index = static_cast<difference_type>(LENGTH(seqs_begin[s]) * (double(i + 1) / (num_samples + 1)) * (double(length) / total_length));
samples[s * num_samples + i] = seqs_begin[s].first[sample_index];
difference_type sample_index =
static_cast<difference_type>(
LENGTH(seqs_begin[s]) * (double(i + 1) /
(num_samples + 1)) * (double(length)
/ total_length));
samples[s * num_samples + i] =
seqs_begin[s].first[sample_index];
}
if (stable)
__gnu_sequential::stable_sort(samples, samples + (num_samples * k), comp);
__gnu_sequential::stable_sort(
samples, samples + (num_samples * k), comp);
else
__gnu_sequential::sort(samples, samples + (num_samples * k), comp);
__gnu_sequential::sort(
samples, samples + (num_samples * k), comp);
for (int slab = 0; slab < num_threads; slab++)
// For each slab / processor.
......@@ -1393,34 +1597,51 @@ namespace __gnu_parallel
{
// For each sequence.
if (slab > 0)
pieces[slab][seq].first = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * slab / num_threads], comp) - seqs_begin[seq].first;
pieces[slab][seq].first =
std::upper_bound(
seqs_begin[seq].first,
seqs_begin[seq].second,
samples[num_samples * k * slab / num_threads],
comp)
- seqs_begin[seq].first;
else
{
// Absolute beginning.
pieces[slab][seq].first = 0;
}
if ((slab + 1) < num_threads)
pieces[slab][seq].second = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * (slab + 1) / num_threads], comp) - seqs_begin[seq].first;
pieces[slab][seq].second =
std::upper_bound(
seqs_begin[seq].first,
seqs_begin[seq].second,
samples[num_samples * k * (slab + 1) /
num_threads], comp)
- seqs_begin[seq].first;
else
pieces[slab][seq].second = LENGTH(seqs_begin[seq]); //absolute ending
pieces[slab][seq].second = LENGTH(seqs_begin[seq]);
}
delete[] samples;
}
else
{
// (Settings::multiway_merge_splitting == Settings::EXACT).
std::vector<RandomAccessIterator1>* offsets = new std::vector<RandomAccessIterator1>[num_threads];
std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > se(k);
std::vector<RandomAccessIterator1>* offsets =
new std::vector<RandomAccessIterator1>[num_threads];
std::vector<
std::pair<RandomAccessIterator1, RandomAccessIterator1>
> se(k);
copy(seqs_begin, seqs_end, se.begin());
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1)));
difference_type* borders =
new difference_type[num_threads + 1];
equally_split(length, num_threads, borders);
for (int s = 0; s < (num_threads - 1); s++)
{
offsets[s].resize(k);
multiseq_partition(se.begin(), se.end(), borders[s + 1],
multiseq_partition(
se.begin(), se.end(), borders[s + 1],
offsets[s].begin(), comp);
// Last one also needed and available.
......@@ -1446,21 +1667,23 @@ namespace __gnu_parallel
pieces[slab][seq].first = 0;
}
else
pieces[slab][seq].first = pieces[slab - 1][seq].second;
pieces[slab][seq].first =
pieces[slab - 1][seq].second;
if (!tight || slab < (num_threads - 1))
pieces[slab][seq].second = offsets[slab][seq] - seqs_begin[seq].first;
pieces[slab][seq].second =
offsets[slab][seq] - seqs_begin[seq].first;
else
{
// slab == num_threads - 1
pieces[slab][seq].second = LENGTH(seqs_begin[seq]);
pieces[slab][seq].second =
LENGTH(seqs_begin[seq]);
}
}
}
delete[] offsets;
}
} //single
# pragma omp parallel num_threads(num_threads)
{
thread_index_t iam = omp_get_thread_num();
difference_type target_position = 0;
......@@ -1470,16 +1693,21 @@ namespace __gnu_parallel
if (k > 2)
{
std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks = new std::pair<RandomAccessIterator1, RandomAccessIterator1>[k];
std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks
= new
std::pair<RandomAccessIterator1, RandomAccessIterator1>[k];
difference_type local_length = 0;
for (int s = 0; s < k; s++)
{
chunks[s] = std::make_pair(seqs_begin[s].first + pieces[iam][s].first, seqs_begin[s].first + pieces[iam][s].second);
chunks[s] = std::make_pair(
seqs_begin[s].first + pieces[iam][s].first,
seqs_begin[s].first + pieces[iam][s].second);
local_length += LENGTH(chunks[s]);
}
multiway_merge(chunks, chunks + k, target + target_position, comp,
multiway_merge(
chunks, chunks + k, target + target_position, comp,
std::min(local_length, length - target_position),
stable, false, sequential_tag());
......@@ -1487,16 +1715,19 @@ namespace __gnu_parallel
}
else if (k == 2)
{
RandomAccessIterator1 begin0 = seqs_begin[0].first + pieces[iam][0].first, begin1 = seqs_begin[1].first + pieces[iam][1].first;
RandomAccessIterator1
begin0 = seqs_begin[0].first + pieces[iam][0].first,
begin1 = seqs_begin[1].first + pieces[iam][1].first;
merge_advance(begin0,
seqs_begin[0].first + pieces[iam][0].second,
begin1,
seqs_begin[1].first + pieces[iam][1].second,
target + target_position,
(pieces[iam][0].second - pieces[iam][0].first) + (pieces[iam][1].second - pieces[iam][1].first),
(pieces[iam][0].second - pieces[iam][0].first) +
(pieces[iam][1].second - pieces[iam][1].first),
comp);
}
}
} //parallel
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp));
......@@ -1511,7 +1742,7 @@ namespace __gnu_parallel
return target + length;
}
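The SAMPLING branch above can be summarised as: draw evenly spaced samples from every input, sort all samples, and use every (k * num_samples / num_threads)-th sample as a splitter, locating it in each input with upper_bound. A simplified standalone sketch (sample_split is a hypothetical helper; it omits the length/total_length scaling and the EXACT splitting via multiseq_partition used above, and assumes at least one non-empty input):
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>
// pieces[slab][seq] is the offset at which thread "slab" starts in input
// "seq"; thread t merges [pieces[t][s], pieces[t+1][s]) for every s.
template<typename T, typename Comp>
std::vector<std::vector<std::ptrdiff_t> >
sample_split(const std::vector<std::pair<const T*, const T*> >& seqs,
             int num_threads, int oversampling, Comp comp)
{
  const int k = static_cast<int>(seqs.size());
  const int num_samples = oversampling * num_threads;
  std::vector<T> samples;
  for (int s = 0; s < k; ++s)
    {
      std::ptrdiff_t len = seqs[s].second - seqs[s].first;
      if (len == 0)
        continue;                                      // nothing to sample
      for (int i = 0; i < num_samples; ++i)
        samples.push_back(seqs[s].first[len * (i + 1) / (num_samples + 1)]);
    }
  std::sort(samples.begin(), samples.end(), comp);
  std::vector<std::vector<std::ptrdiff_t> >
    pieces(num_threads + 1, std::vector<std::ptrdiff_t>(k));
  for (int slab = 0; slab <= num_threads; ++slab)
    for (int s = 0; s < k; ++s)
      {
        if (slab == 0)
          pieces[slab][s] = 0;                              // absolute begin
        else if (slab == num_threads)
          pieces[slab][s] = seqs[s].second - seqs[s].first; // absolute end
        else
          pieces[slab][s] =
            std::upper_bound(seqs[s].first, seqs[s].second,
                             samples[samples.size() * slab / num_threads],
                             comp)
            - seqs[s].first;                                // splitter rank
      }
  return pieces;
}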
/**
/**
* @brief Multi-way merging front-end.
* @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence.
......@@ -1521,7 +1752,11 @@ namespace __gnu_parallel
* @param stable Stable merging incurs a performance penalty.
* @return End iterator of output sequence.
*/
template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorPairIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge(RandomAccessIteratorPairIterator seqs_begin,
RandomAccessIteratorPairIterator seqs_end,
......@@ -1535,15 +1770,21 @@ namespace __gnu_parallel
return target;
RandomAccessIterator3 target_end;
if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
target_end = parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (difference_type)length, stable, false);
if (_GLIBCXX_PARALLEL_CONDITION(
((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k)
&& ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
target_end = parallel_multiway_merge(
seqs_begin, seqs_end,
target, comp, static_cast<difference_type>(length), stable, false);
else
target_end = multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, false, sequential_tag());
target_end = multiway_merge(
seqs_begin, seqs_end,
target, comp, length, stable, false, sequential_tag());
return target_end;
}
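A possible way to call this front-end directly, assuming a GCC installation that ships the parallel-mode headers and a translation unit compiled with -fopenmp (whether including <parallel/multiway_merge.h> on its own is supported is not guaranteed here; the argument order follows the signature above):
#include <cstddef>
#include <functional>
#include <utility>
#include <vector>
#include <parallel/multiway_merge.h>
int main()
{
  // Three already-sorted inputs.
  int a[] = { 1, 4, 7 };
  int b[] = { 2, 5, 8 };
  int c[] = { 3, 6, 9 };
  typedef std::pair<int*, int*> seq_type;
  std::vector<seq_type> seqs;
  seqs.push_back(seq_type(a, a + 3));
  seqs.push_back(seq_type(b, b + 3));
  seqs.push_back(seq_type(c, c + 3));
  std::vector<int> out(9);
  // seqs_begin, seqs_end, target, comp, length, stable; the iterator pairs
  // are advanced as their elements are consumed.
  __gnu_parallel::multiway_merge(seqs.begin(), seqs.end(), out.begin(),
                                 std::less<int>(),
                                 static_cast<std::ptrdiff_t>(out.size()),
                                 false);
  // out now holds 1 2 3 4 5 6 7 8 9.
  return 0;
}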
/** @brief Multi-way merging front-end.
/** @brief Multi-way merging front-end.
* @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence.
......@@ -1554,7 +1795,11 @@ namespace __gnu_parallel
* @pre For each @c i, @c seqs_begin[i].second must be the end
* marker of the sequence and must also reference one additional
* sentinel element. */
template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
template<
typename RandomAccessIteratorPairIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3
multiway_merge_sentinel(RandomAccessIteratorPairIterator seqs_begin,
RandomAccessIteratorPairIterator seqs_end,
......@@ -1570,10 +1815,16 @@ namespace __gnu_parallel
_GLIBCXX_CALL(seqs_end - seqs_begin)
if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
return parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (typename std::iterator_traits<RandomAccessIterator3>::difference_type)length, stable, true);
if (_GLIBCXX_PARALLEL_CONDITION(
((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k)
&& ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
return parallel_multiway_merge(
seqs_begin, seqs_end,
target, comp, static_cast<difference_type>(length), stable, true);
else
return multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, true, sequential_tag());
return multiway_merge(
seqs_begin, seqs_end,
target, comp, length, stable, true, sequential_tag());
}
}
......
......@@ -48,8 +48,8 @@
namespace __gnu_parallel
{
/** @brief Subsequence description. */
template<typename _DifferenceTp>
/** @brief Subsequence description. */
template<typename _DifferenceTp>
struct Piece
{
typedef _DifferenceTp difference_type;
......@@ -61,16 +61,19 @@ namespace __gnu_parallel
difference_type end;
};
/** @brief Data accessed by all threads.
/** @brief Data accessed by all threads.
*
* PMWMS = parallel multiway mergesort */
template<typename RandomAccessIterator>
template<typename RandomAccessIterator>
struct PMWMSSortingData
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
/** @brief Number of threads involved. */
thread_index_t num_threads;
/** @brief Input begin. */
RandomAccessIterator source;
......@@ -105,62 +108,55 @@ namespace __gnu_parallel
/** @brief Pieces of data to merge @c [thread][sequence] */
std::vector<Piece<difference_type> >* pieces;
};
/** @brief Thread local data for PMWMS. */
template<typename RandomAccessIterator>
struct PMWMSSorterPU
{
/** @brief Total number of thread involved. */
thread_index_t num_threads;
/** @brief Number of owning thread. */
thread_index_t iam;
/** @brief Stable sorting desired. */
bool stable;
/** @brief Pointer to global data. */
PMWMSSortingData<RandomAccessIterator>* sd;
};
};
/**
/**
* @brief Select samples from a sequence.
* @param d Pointer to thread-local data. Result will be placed in
* @c d->ds->samples.
* @param sd Pointer to algorithm data. Result will be placed in
* @c sd->samples.
* @param num_samples Number of samples to select.
*/
template<typename RandomAccessIterator, typename _DifferenceTp>
template<typename RandomAccessIterator, typename _DifferenceTp>
inline void
determine_samples(PMWMSSorterPU<RandomAccessIterator>* d,
determine_samples(PMWMSSortingData<RandomAccessIterator>* sd,
_DifferenceTp& num_samples)
{
typedef _DifferenceTp difference_type;
PMWMSSortingData<RandomAccessIterator>* sd = d->sd;
thread_index_t iam = omp_get_thread_num();
num_samples = Settings::sort_mwms_oversampling * d->num_threads - 1;
num_samples =
Settings::sort_mwms_oversampling * sd->num_threads - 1;
difference_type* es = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_samples + 2)));
difference_type* es = new difference_type[num_samples + 2];
equally_split(sd->starts[d->iam + 1] - sd->starts[d->iam], num_samples + 1, es);
equally_split(sd->starts[iam + 1] - sd->starts[iam],
num_samples + 1, es);
for (difference_type i = 0; i < num_samples; i++)
sd->samples[d->iam * num_samples + i] = sd->source[sd->starts[d->iam] + es[i + 1]];
sd->samples[iam * num_samples + i] =
sd->source[sd->starts[iam] + es[i + 1]];
delete[] es;
}
/** @brief PMWMS code executed by each thread.
* @param d Pointer to thread-local data.
/** @brief PMWMS code executed by each thread.
* @param sd Pointer to algorithm data.
* @param comp Comparator.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_mwms_pu(PMWMSSorterPU<RandomAccessIterator>* d,
parallel_sort_mwms_pu(PMWMSSortingData<RandomAccessIterator>* sd,
Comparator& comp)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
PMWMSSortingData<RandomAccessIterator>* sd = d->sd;
thread_index_t iam = d->iam;
thread_index_t iam = omp_get_thread_num();
// Length of this thread's chunk, before merging.
difference_type length_local = sd->starts[iam + 1] - sd->starts[iam];
......@@ -174,21 +170,25 @@ namespace __gnu_parallel
typedef value_type* SortingPlacesIterator;
// Sort in temporary storage, leave space for sentinel.
sd->sorting_places[iam] = sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * (length_local + 1)));
sd->sorting_places[iam] = sd->temporaries[iam] =
static_cast<value_type*>(
::operator new(sizeof(value_type) * (length_local + 1)));
// Copy there.
std::uninitialized_copy(sd->source + sd->starts[iam], sd->source + sd->starts[iam] + length_local, sd->sorting_places[iam]);
std::uninitialized_copy(sd->source + sd->starts[iam],
sd->source + sd->starts[iam] + length_local,
sd->sorting_places[iam]);
#endif
// Sort locally.
if (d->stable)
__gnu_sequential::stable_sort(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp);
if (sd->stable)
__gnu_sequential::stable_sort(sd->sorting_places[iam],
sd->sorting_places[iam] + length_local,
comp);
else
__gnu_sequential::sort(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp);
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp));
#endif
__gnu_sequential::sort(sd->sorting_places[iam],
sd->sorting_places[iam] + length_local,
comp);
// Invariant: locally sorted subsequence in sd->sorting_places[iam],
// sd->sorting_places[iam] + length_local.
......@@ -196,22 +196,23 @@ namespace __gnu_parallel
if (Settings::sort_splitting == Settings::SAMPLING)
{
difference_type num_samples;
determine_samples(d, num_samples);
determine_samples(sd, num_samples);
#pragma omp barrier
# pragma omp barrier
#pragma omp single
# pragma omp single
__gnu_sequential::sort(sd->samples,
sd->samples + (num_samples * d->num_threads),
sd->samples + (num_samples * sd->num_threads),
comp);
#pragma omp barrier
# pragma omp barrier
for (int s = 0; s < d->num_threads; s++)
for (int s = 0; s < sd->num_threads; s++)
{
// For each sequence.
if (num_samples * iam > 0)
sd->pieces[iam][s].begin = std::lower_bound(sd->sorting_places[s],
sd->pieces[iam][s].begin =
std::lower_bound(sd->sorting_places[s],
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
sd->samples[num_samples * iam],
comp)
......@@ -220,43 +221,47 @@ namespace __gnu_parallel
// Absolute beginning.
sd->pieces[iam][s].begin = 0;
if ((num_samples * (iam + 1)) < (num_samples * d->num_threads))
sd->pieces[iam][s].end = std::lower_bound(sd->sorting_places[s],
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s], sd->samples[num_samples * (iam + 1)], comp)
if ((num_samples * (iam + 1)) < (num_samples * sd->num_threads))
sd->pieces[iam][s].end =
std::lower_bound(sd->sorting_places[s],
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
sd->samples[num_samples * (iam + 1)], comp)
- sd->sorting_places[s];
else
// Absolute end.
sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
}
}
else if (Settings::sort_splitting == Settings::EXACT)
{
#pragma omp barrier
# pragma omp barrier
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads);
for (int s = 0; s < d->num_threads; s++)
seqs[s] = std::make_pair(sd->sorting_places[s], sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]);
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
seqs(sd->num_threads);
for (int s = 0; s < sd->num_threads; s++)
seqs[s] = std::make_pair(sd->sorting_places[s],
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]);
std::vector<SortingPlacesIterator> offsets(d->num_threads);
std::vector<SortingPlacesIterator> offsets(sd->num_threads);
// If not last thread.
if (iam < d->num_threads - 1)
multiseq_partition(seqs.begin(), seqs.end(), sd->starts[iam + 1], offsets.begin(), comp);
// if not last thread
if (iam < sd->num_threads - 1)
multiseq_partition(seqs.begin(), seqs.end(),
sd->starts[iam + 1], offsets.begin(), comp);
for (int seq = 0; seq < d->num_threads; seq++)
for (int seq = 0; seq < sd->num_threads; seq++)
{
// For each sequence.
if (iam < (d->num_threads - 1))
// for each sequence
if (iam < (sd->num_threads - 1))
sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
else
// Absolute end of this sequence.
// very end of this sequence
sd->pieces[iam][seq].end = sd->starts[seq + 1] - sd->starts[seq];
}
#pragma omp barrier
# pragma omp barrier
for (int seq = 0; seq < d->num_threads; seq++)
for (int seq = 0; seq < sd->num_threads; seq++)
{
// For each sequence.
if (iam > 0)
......@@ -269,7 +274,7 @@ namespace __gnu_parallel
// Offset from target begin, length after merging.
difference_type offset = 0, length_am = 0;
for (int s = 0; s < d->num_threads; s++)
for (int s = 0; s < sd->num_threads; s++)
{
length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin;
offset += sd->pieces[iam][s].begin;
......@@ -279,40 +284,37 @@ namespace __gnu_parallel
// Merge to temporary storage; uninitialized creation is not possible
// since there is no multiway_merge variant that calls placement new
// instead of the assignment operator.
sd->merging_places[iam] = sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * length_am));
sd->merging_places[iam] = sd->temporaries[iam] =
static_cast<value_type*>(
::operator new(sizeof(value_type) * length_am));
#else
// Merge directly to target.
sd->merging_places[iam] = sd->source + offset;
#endif
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads);
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
seqs(sd->num_threads);
for (int s = 0; s < d->num_threads; s++)
for (int s = 0; s < sd->num_threads; s++)
{
seqs[s] = std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin, sd->sorting_places[s] + sd->pieces[iam][s].end);
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(seqs[s].first, seqs[s].second, comp));
#endif
seqs[s] = std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin,
sd->sorting_places[s] + sd->pieces[iam][s].end);
}
multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, d->stable, false, sequential_tag());
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->merging_places[iam], sd->merging_places[iam] + length_am, comp));
#endif
multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, sd->stable, false, sequential_tag());
# pragma omp barrier
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
// Write back.
std::copy(sd->merging_places[iam], sd->merging_places[iam] + length_am,
std::copy(sd->merging_places[iam],
sd->merging_places[iam] + length_am,
sd->source + offset);
#endif
delete[] sd->temporaries[iam];
}
/** @brief PMWMS main call.
/** @brief PMWMS main call.
* @param begin Begin iterator of sequence.
* @param end End iterator of sequence.
* @param comp Comparator.
......@@ -320,12 +322,13 @@ namespace __gnu_parallel
* @param num_threads Number of threads to use.
* @param stable Stable sorting.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
int num_threads, bool stable)
int num_threads,
bool stable)
{
_GLIBCXX_CALL(n)
......@@ -336,12 +339,21 @@ namespace __gnu_parallel
if (n <= 1)
return;
// At least one element per thread.
// at least one element per thread
if (num_threads > n)
num_threads = static_cast<thread_index_t>(n);
// shared variables
PMWMSSortingData<RandomAccessIterator> sd;
difference_type* starts;
# pragma omp parallel num_threads(num_threads)
{
num_threads = omp_get_num_threads(); //no more threads than requested
# pragma omp single
{
sd.num_threads = num_threads;
sd.source = begin;
sd.temporaries = new value_type*[num_threads];
......@@ -355,12 +367,10 @@ namespace __gnu_parallel
if (Settings::sort_splitting == Settings::SAMPLING)
{
unsigned int sz = Settings::sort_mwms_oversampling * num_threads - 1;
sz *= num_threads;
// Equivalent to value_type[sz], without need of default construction.
sz *= sizeof(value_type);
sd.samples = static_cast<value_type*>(::operator new(sz));
unsigned int size =
(Settings::sort_mwms_oversampling * num_threads - 1) * num_threads;
sd.samples = static_cast<value_type*>(
::operator new(size * sizeof(value_type)));
}
else
sd.samples = NULL;
......@@ -369,28 +379,24 @@ namespace __gnu_parallel
sd.pieces = new std::vector<Piece<difference_type> >[num_threads];
for (int s = 0; s < num_threads; s++)
sd.pieces[s].resize(num_threads);
PMWMSSorterPU<RandomAccessIterator>* pus = new PMWMSSorterPU<RandomAccessIterator>[num_threads];
difference_type* starts = sd.starts = new difference_type[num_threads + 1];
starts = sd.starts = new difference_type[num_threads + 1];
sd.stable = stable;
difference_type chunk_length = n / num_threads;
difference_type split = n % num_threads;
difference_type start = 0;
difference_type pos = 0;
for (int i = 0; i < num_threads; i++)
{
starts[i] = start;
start += (i < split) ? (chunk_length + 1) : chunk_length;
pus[i].num_threads = num_threads;
pus[i].iam = i;
pus[i].sd = &sd;
pus[i].stable = stable;
starts[i] = pos;
pos += (i < split) ? (chunk_length + 1) : chunk_length;
}
starts[num_threads] = pos;
}
starts[num_threads] = start;
// Now sort in parallel.
#pragma omp parallel num_threads(num_threads)
parallel_sort_mwms_pu(&(pus[omp_get_thread_num()]), comp);
parallel_sort_mwms_pu(&sd, comp);
} //parallel
// XXX sd as RAII
delete[] starts;
delete[] sd.temporaries;
delete[] sd.sorting_places;
......@@ -401,10 +407,7 @@ namespace __gnu_parallel
delete[] sd.offsets;
delete[] sd.pieces;
delete[] pus;
}
}
} //namespace __gnu_parallel
#endif
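The starts[] loop above is the usual balanced split of n elements over num_threads chunks, with the first n % num_threads chunks one element larger. A standalone sketch using the same convention of num_threads + 1 boundaries (chunk_boundaries is a hypothetical helper, not equally_split itself):
#include <cassert>
#include <vector>
// boundaries[i] is where chunk i begins; boundaries[num_threads] == n.
std::vector<long>
chunk_boundaries(long n, int num_threads)
{
  std::vector<long> starts(num_threads + 1);
  long chunk_length = n / num_threads;
  long split = n % num_threads;
  long pos = 0;
  for (int i = 0; i < num_threads; ++i)
    {
      starts[i] = pos;
      pos += (i < split) ? (chunk_length + 1) : chunk_length;
    }
  starts[num_threads] = pos;   // == n
  return starts;
}
int main()
{
  std::vector<long> b = chunk_boundaries(10, 4);   // chunks of 3, 3, 2, 2
  assert(b[0] == 0 && b[1] == 3 && b[2] == 6 && b[3] == 8 && b[4] == 10);
  return 0;
}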
......@@ -43,10 +43,11 @@
#include <parallel/settings.h>
#include <parallel/basic_iterator.h>
#include <parallel/base.h>
namespace __gnu_parallel
{
/** @brief Embarrassingly parallel algorithm for random access
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using an OpenMP for loop.
*
* @param begin Begin iterator of element sequence.
......@@ -63,34 +64,50 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_omp_loop(RandomAccessIterator begin, RandomAccessIterator end, Op o, Fu& f, Red r, Result base, Result& output, typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_omp_loop(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type;
typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : static_cast<thread_index_t>((end - begin));
Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++)
{
thread_results[i] = r(thread_results[i], f(o, begin+i));
}
Result *thread_results;
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
#pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
# pragma omp single
{
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
}
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
for (thread_index_t i = 0; i < num_threads; i++)
thread_results[i] = Result();
}
thread_index_t iam = omp_get_thread_num();
# pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
{
output = r(output, thread_results[i]);
}
delete [] thread_results;
......@@ -100,6 +117,7 @@ namespace __gnu_parallel
return o;
}
} // end namespace
#endif
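// [Editor's sketch, not part of this patch.]  The shape this hunk gives
// for_each_template_random_access_omp_loop: ask for num_threads, but read the
// actual team size with omp_get_num_threads() inside an omp single and only
// then allocate the per-thread result slots, because with dynamic thread
// adjustment the runtime may hand out fewer threads than requested.  A
// self-contained toy version of that pattern:
#include <omp.h>
#include <cstdio>
int
main()
{
omp_set_dynamic(1);                  // runtime may shrink the team
int num_threads = 8;                 // requested, not guaranteed
long* partial = 0;
const long n = 1000000;
#pragma omp parallel num_threads(num_threads)
{
#pragma omp single
{
num_threads = omp_get_num_threads();   // size everything from this
partial = new long[num_threads];
for (int i = 0; i < num_threads; ++i)
partial[i] = 0;
}                                        // implicit barrier
int iam = omp_get_thread_num();
#pragma omp for
for (long i = 0; i < n; ++i)
partial[iam] += 1;
} //parallel
long total = 0;
for (int i = 0; i < num_threads; ++i)
total += partial[i];
delete[] partial;
std::printf("%ld elements, %d thread(s)\n", total, num_threads);
}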
......@@ -64,39 +64,50 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_omp_loop_static(RandomAccessIterator begin,
for_each_template_random_access_omp_loop_static(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r,
Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin);
Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++)
{
thread_results[i] = r(thread_results[i], f(o, begin+i));
}
Result *thread_results;
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
#pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
# pragma omp single
{
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
}
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
for (thread_index_t i = 0; i < num_threads; i++)
thread_results[i] = Result();
}
thread_index_t iam = omp_get_thread_num();
# pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
{
output = r(output, thread_results[i]);
}
delete [] thread_results;
......@@ -106,6 +117,7 @@ namespace __gnu_parallel
return o;
}
} // end namespace
#endif
......@@ -41,11 +41,12 @@
#include <omp.h>
#include <parallel/settings.h>
#include <parallel/base.h>
namespace __gnu_parallel
{
/** @brief Embarrassingly parallel algorithm for random access
/** @brief Embarrassingly parallel algorithm for random access
* iterators, using hand-crafted parallelization by equally splitting
* the work.
*
......@@ -63,47 +64,57 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_ed(RandomAccessIterator begin,
RandomAccessIterator end, Op o, Fu& f,
Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
for_each_template_random_access_ed(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
const difference_type length = end - begin;
const difference_type settings_threads = static_cast<difference_type>(get_max_threads());
const difference_type dmin = settings_threads < length ? settings_threads : length;
const difference_type dmax = dmin > 1 ? dmin : 1;
Result *thread_results;
thread_index_t num_threads = static_cast<thread_index_t>(dmax);
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
}
Result *thread_results = new Result[num_threads];
thread_index_t iam = omp_get_thread_num();
#pragma omp parallel num_threads(num_threads)
{
// Neutral element.
Result reduct = Result();
thread_index_t p = num_threads;
thread_index_t iam = omp_get_thread_num();
difference_type start = iam * length / p;
difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p;
difference_type
start = equally_split_point(length, num_threads, iam),
stop = equally_split_point(length, num_threads, iam + 1);
if (start < limit)
if (start < stop)
{
reduct = f(o, begin + start);
start++;
++start;
}
for (; start < limit; start++)
for (; start < stop; ++start)
reduct = r(reduct, f(o, begin + start));
thread_results[iam] = reduct;
}
} //parallel
for (thread_index_t i = 0; i < num_threads; i++)
output = r(output, thread_results[i]);
......
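// [Editor's sketch, not part of this patch.]  for_each_template_random_access_ed
// above derives each thread's half-open range from
// equally_split_point(length, num_threads, iam).  equally_split.h is not shown
// in this view; a definition consistent with that usage, where the first
// n % num_threads pieces get one extra element, would look like:
template<typename DiffType>
DiffType
split_point_sketch(DiffType n, int num_threads, int thread_no)
{
DiffType chunk = n / num_threads;
DiffType longer = n % num_threads;      // pieces holding one extra element
if (thread_no < longer)
return thread_no * (chunk + 1);
return longer * (chunk + 1) + (thread_no - longer) * chunk;
}
// Thread i then works on [split_point_sketch(n, p, i),
// split_point_sketch(n, p, i + 1)); since split_point_sketch(n, p, p) == n,
// the pieces tile [0, n) exactly, matching the start/stop pair above.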
......@@ -48,7 +48,7 @@ namespace __gnu_parallel
{
// Problem: there is no 0-element given.
/** @brief Base case prefix sum routine.
/** @brief Base case prefix sum routine.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
......@@ -56,9 +56,13 @@ namespace __gnu_parallel
* @param value Start value. Must be passed since the neutral
* element is unknown in general.
* @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
inline OutputIterator
parallel_partial_sum_basecase(InputIterator begin, InputIterator end,
parallel_partial_sum_basecase(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::value_type value)
{
......@@ -75,7 +79,7 @@ namespace __gnu_parallel
return result;
}
/** @brief Parallel partial sum implementation, two-phase approach,
/** @brief Parallel partial sum implementation, two-phase approach,
no recursion.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
......@@ -85,31 +89,49 @@ namespace __gnu_parallel
* @param num_threads Number of threads to use.
* @return End iterator of output sequence.
*/
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator
parallel_partial_sum_linear(InputIterator begin, InputIterator end,
parallel_partial_sum_linear(
InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::difference_type n, int num_threads)
typename std::iterator_traits<InputIterator>::difference_type n)
{
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
if (num_threads > (n - 1))
num_threads = static_cast<thread_index_t>(n - 1);
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), n - 1);
if (num_threads < 2)
{
*result = *begin;
return parallel_partial_sum_basecase(begin + 1, end, result + 1, bin_op, *begin);
return parallel_partial_sum_basecase(
begin + 1, end, result + 1, bin_op, *begin);
}
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 2)));
difference_type* borders;
value_type* sums;
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
if (Settings::partial_sum_dilatation == 1.0f)
equally_split(n, num_threads + 1, borders);
else
{
difference_type chunk_length = (int)((double)n / ((double)num_threads + Settings::partial_sum_dilatation)), borderstart = n - num_threads * chunk_length;
difference_type chunk_length =
((double)n /
((double)num_threads + Settings::partial_sum_dilatation)),
borderstart = n - num_threads * chunk_length;
borders[0] = 0;
for (int i = 1; i < (num_threads + 1); i++)
{
......@@ -119,13 +141,13 @@ namespace __gnu_parallel
borders[num_threads + 1] = n;
}
value_type* sums = static_cast<value_type*>(::operator new(sizeof(value_type) * num_threads));
sums = static_cast<value_type*>(
::operator new(sizeof(value_type) * num_threads));
OutputIterator target_end;
} //single
#pragma omp parallel num_threads(num_threads)
{
int id = omp_get_thread_num();
if (id == 0)
int iam = omp_get_thread_num();
if (iam == 0)
{
*result = *begin;
parallel_partial_sum_basecase(begin + 1, begin + borders[1],
......@@ -134,44 +156,48 @@ namespace __gnu_parallel
}
else
{
sums[id] = std::accumulate(begin + borders[id] + 1,
begin + borders[id + 1],
*(begin + borders[id]),
sums[iam] = std::accumulate(begin + borders[iam] + 1,
begin + borders[iam + 1],
*(begin + borders[iam]),
bin_op, __gnu_parallel::sequential_tag());
}
#pragma omp barrier
# pragma omp barrier
#pragma omp single
parallel_partial_sum_basecase(sums + 1, sums + num_threads, sums + 1,
bin_op, sums[0]);
# pragma omp single
parallel_partial_sum_basecase(
sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]);
#pragma omp barrier
# pragma omp barrier
// Still same team.
parallel_partial_sum_basecase(begin + borders[id + 1],
begin + borders[id + 2],
result + borders[id + 1], bin_op,
sums[id]);
}
parallel_partial_sum_basecase(begin + borders[iam + 1],
begin + borders[iam + 2],
result + borders[iam + 1], bin_op,
sums[iam]);
} //parallel
delete [] sums;
delete[] sums;
delete[] borders;
return result + n;
}
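// [Editor's sketch, not part of this patch.]  The two-phase scheme of
// parallel_partial_sum_linear above in compact standalone form: phase one
// sums each block into a per-thread carry slot, a prefix sum over those
// carries gives every block its offset, and phase two rescans each block with
// that offset.  (Plain addition, equal-sized blocks, no partial_sum_dilatation.)
#include <omp.h>
#include <numeric>
#include <vector>
std::vector<long>
prefix_sum_sketch(const std::vector<long>& in)
{
const long n = static_cast<long>(in.size());
std::vector<long> out(in.size());
std::vector<long> carry;                 // carry[i + 1] = total of block i
int num_threads = 1;
#pragma omp parallel
{
#pragma omp single
{
num_threads = omp_get_num_threads();
carry.assign(num_threads + 1, 0);
}
int iam = omp_get_thread_num();
long start = static_cast<long>(iam) * n / num_threads;
long stop = static_cast<long>(iam + 1) * n / num_threads;
// Phase 1: block totals only.
carry[iam + 1] = std::accumulate(in.begin() + start, in.begin() + stop, 0L);
#pragma omp barrier
#pragma omp single
// carry[i] becomes the sum of all blocks before block i.
std::partial_sum(carry.begin(), carry.end(), carry.begin());
// Phase 2: rescan the block, shifted by its carry-in.
long running = carry[iam];
for (long i = start; i < stop; ++i)
{
running += in[i];
out[i] = running;
}
} //parallel
return out;
}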
/** @brief Parallel partial sum front-end.
/** @brief Parallel partial sum front-end.
* @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence.
* @param result Begin iterator of output sequence.
* @param bin_op Associative binary function.
* @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator
parallel_partial_sum(InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op)
{
_GLIBCXX_CALL(begin - end);
_GLIBCXX_CALL(begin - end)
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -179,14 +205,11 @@ namespace __gnu_parallel
difference_type n = end - begin;
int num_threads = get_max_threads();
switch (Settings::partial_sum_algorithm)
{
case Settings::LINEAR:
// Need an initial offset.
return parallel_partial_sum_linear(begin, end, result, bin_op,
n, num_threads);
return parallel_partial_sum_linear(begin, end, result, bin_op, n);
default:
// Partial_sum algorithm not implemented.
_GLIBCXX_PARALLEL_ASSERT(0);
......
......@@ -45,21 +45,21 @@
#include <bits/stl_algo.h>
#include <parallel/parallel.h>
/** @brief Decide whether to declare certain variable volatile in this file. */
/** @brief Decide whether to declare certain variables volatile. */
#define _GLIBCXX_VOLATILE volatile
namespace __gnu_parallel
{
/** @brief Parallel implementation of std::partition.
/** @brief Parallel implementation of std::partition.
* @param begin Begin iterator of input sequence to split.
* @param end End iterator of input sequence to split.
* @param pred Partition predicate, possibly including some kind of pivot.
* @param max_num_threads Maximum number of threads to use for this task.
* @param num_threads Maximum number of threads to use for this task.
* @return Number of elements not fulfilling the predicate. */
template<typename RandomAccessIterator, typename Predicate>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
template<typename RandomAccessIterator, typename Predicate>
typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_partition(RandomAccessIterator begin, RandomAccessIterator end,
Predicate pred, thread_index_t max_num_threads)
Predicate pred, thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -74,25 +74,37 @@ namespace __gnu_parallel
_GLIBCXX_VOLATILE difference_type leftover_left, leftover_right;
_GLIBCXX_VOLATILE difference_type leftnew, rightnew;
bool* reserved_left, * reserved_right;
reserved_left = new bool[max_num_threads];
reserved_right = new bool[max_num_threads];
bool* reserved_left = NULL, * reserved_right = NULL;
difference_type chunk_size;
if (Settings::partition_chunk_share > 0.0)
chunk_size = std::max((difference_type)Settings::partition_chunk_size, (difference_type)((double)n * Settings::partition_chunk_share / (double)max_num_threads));
else
chunk_size = Settings::partition_chunk_size;
omp_lock_t result_lock;
omp_init_lock(&result_lock);
// At least good for two processors.
while (right - left + 1 >= 2 * max_num_threads * chunk_size)
// At least two chunks per thread.
if(right - left + 1 >= 2 * num_threads * chunk_size)
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
reserved_left = new bool[num_threads];
reserved_right = new bool[num_threads];
if (Settings::partition_chunk_share > 0.0)
chunk_size = std::max<difference_type>(
Settings::partition_chunk_size,
(double)n * Settings::partition_chunk_share /
(double)num_threads);
else
chunk_size = Settings::partition_chunk_size;
}
while (right - left + 1 >= 2 * num_threads * chunk_size)
{
# pragma omp single
{
difference_type num_chunks = (right - left + 1) / chunk_size;
thread_index_t num_threads = (int)std::min((difference_type)max_num_threads, num_chunks / 2);
for (int r = 0; r < num_threads; r++)
{
......@@ -101,11 +113,11 @@ namespace __gnu_parallel
}
leftover_left = 0;
leftover_right = 0;
} //implicit barrier
#pragma omp parallel num_threads(num_threads)
{
// Private.
difference_type thread_left, thread_left_border, thread_right, thread_right_border;
difference_type thread_left, thread_left_border,
thread_right, thread_right_border;
thread_left = left + 1;
// Just to satisfy the condition below.
......@@ -150,12 +162,15 @@ namespace __gnu_parallel
// Swap as usual.
while (thread_left < thread_right)
{
while (pred(begin[thread_left]) && thread_left <= thread_left_border)
while (pred(begin[thread_left])
&& thread_left <= thread_left_border)
thread_left++;
while (!pred(begin[thread_right]) && thread_right >= thread_right_border)
while (!pred(begin[thread_right])
&& thread_right >= thread_right_border)
thread_right--;
if (thread_left > thread_left_border || thread_right < thread_right_border)
if (thread_left > thread_left_border
|| thread_right < thread_right_border)
// Fetch new chunk(s).
break;
......@@ -167,28 +182,29 @@ namespace __gnu_parallel
// Now swap the leftover chunks to the right places.
if (thread_left <= thread_left_border)
#pragma omp atomic
# pragma omp atomic
leftover_left++;
if (thread_right >= thread_right_border)
#pragma omp atomic
# pragma omp atomic
leftover_right++;
#pragma omp barrier
# pragma omp barrier
#pragma omp single
# pragma omp single
{
leftnew = left - leftover_left * chunk_size;
rightnew = right + leftover_right * chunk_size;
}
#pragma omp barrier
# pragma omp barrier
// <=> thread_left_border + (chunk_size - 1) >= leftnew
if (thread_left <= thread_left_border
&& thread_left_border >= leftnew)
{
// Chunk already in place, reserve spot.
reserved_left[(left - (thread_left_border + 1)) / chunk_size] = true;
reserved_left[(left - (thread_left_border + 1)) / chunk_size]
= true;
}
// <=> thread_right_border - (chunk_size - 1) <= rightnew
......@@ -196,12 +212,15 @@ namespace __gnu_parallel
&& thread_right_border <= rightnew)
{
// Chunk already in place, reserve spot.
reserved_right[((thread_right_border - 1) - right) / chunk_size] = true;
reserved_right
[((thread_right_border - 1) - right) / chunk_size]
= true;
}
#pragma omp barrier
# pragma omp barrier
if (thread_left <= thread_left_border && thread_left_border < leftnew)
if (thread_left <= thread_left_border
&& thread_left_border < leftnew)
{
// Find spot and swap.
difference_type swapstart = -1;
......@@ -219,7 +238,10 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif
std::swap_ranges(begin + thread_left_border - (chunk_size - 1), begin + thread_left_border + 1, begin + swapstart);
std::swap_ranges(
begin + thread_left_border - (chunk_size - 1),
begin + thread_left_border + 1,
begin + swapstart);
}
if (thread_right >= thread_right_border
......@@ -241,12 +263,14 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif
std::swap_ranges(begin + thread_right_border, begin + thread_right_border + chunk_size, begin + swapstart);
std::swap_ranges(begin + thread_right_border,
begin + thread_right_border + chunk_size,
begin + swapstart);
}
#if _GLIBCXX_ASSERTIONS
#pragma omp barrier
# pragma omp barrier
#pragma omp single
# pragma omp single
{
for (int r = 0; r < leftover_left; r++)
_GLIBCXX_PARALLEL_ASSERT(reserved_left[r]);
......@@ -254,14 +278,16 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(reserved_right[r]);
}
#pragma omp barrier
# pragma omp barrier
#endif
#pragma omp barrier
# pragma omp barrier
left = leftnew;
right = rightnew;
}
} // end "recursion"
# pragma omp flush(left, right)
} // end "recursion" //parallel
difference_type final_left = left, final_right = right;
......@@ -298,14 +324,14 @@ namespace __gnu_parallel
return final_left + 1;
}
/**
/**
* @brief Parallel implementation of std::nth_element().
* @param begin Begin iterator of input sequence.
* @param nth Iterator of element that must be in position afterwards.
* @param end End iterator of input sequence.
* @param comp Comparator.
*/
template<typename RandomAccessIterator, typename Comparator>
template<typename RandomAccessIterator, typename Comparator>
void
parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth,
RandomAccessIterator end, Comparator comp)
......@@ -377,12 +403,12 @@ namespace __gnu_parallel
__gnu_sequential::sort(begin, end, comp);
}
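// [Editor's sketch, not part of this patch.]  The narrowing loop behind
// parallel_nth_element above: partition around a pivot, keep only the side
// that still contains nth, and fall back to a plain sort once the range is
// small (the __gnu_sequential::sort call).  Minimal sequential form, with
// std::partition standing in for parallel_partition:
#include <algorithm>
#include <iterator>
template<typename Iter, typename Comp>
void
nth_element_sketch(Iter begin, Iter nth, Iter end, Comp comp)
{
typedef typename std::iterator_traits<Iter>::value_type value_type;
while (end - begin > 32)
{
value_type pivot = *(begin + (end - begin) / 2);
// Three-way split: [begin, lt) < pivot, [lt, eq) == pivot, rest > pivot.
Iter lt = std::partition(begin, end,
[&](const value_type& x) { return comp(x, pivot); });
Iter eq = std::partition(lt, end,
[&](const value_type& x) { return !comp(pivot, x); });
if (nth < lt)
end = lt;                       // nth lies among the smaller elements
else if (nth < eq)
return;                         // nth landed inside the pivot block
else
begin = eq;                     // nth lies among the larger elements
}
std::sort(begin, end, comp);
}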
/** @brief Parallel implementation of std::partial_sort().
* @param begin Begin iterator of input sequence.
* @param middle Sort until this position.
* @param end End iterator of input sequence.
* @param comp Comparator. */
template<typename RandomAccessIterator, typename Comparator>
/** @brief Parallel implementation of std::partial_sort().
* @param begin Begin iterator of input sequence.
* @param middle Sort until this position.
* @param end End iterator of input sequence.
* @param comp Comparator. */
template<typename RandomAccessIterator, typename Comparator>
void
parallel_partial_sort(RandomAccessIterator begin, RandomAccessIterator middle, RandomAccessIterator end, Comparator comp)
{
......
......@@ -53,11 +53,17 @@ namespace __gnu_parallel
* this part.
*/
template<typename RandomAccessIterator, typename Comparator>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end,
inline
typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_sort_qs_divide(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads)
typename std::iterator_traits<RandomAccessIterator>::difference_type
pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type
num_samples,
thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -65,20 +71,24 @@ namespace __gnu_parallel
difference_type n = end - begin;
num_samples = std::min(num_samples, n);
value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples));
// Allocate uninitialized, to avoid default constructor.
value_type* samples = static_cast<value_type*>(
operator new(num_samples * sizeof(value_type)));
for (difference_type s = 0; s < num_samples; s++)
{
const unsigned long long index = static_cast<unsigned long long>(s)
* n / num_samples;
samples[s] = begin[index];
new(samples + s) value_type(begin[index]);
}
__gnu_sequential::sort(samples, samples + num_samples, comp);
value_type& pivot = samples[pivot_rank * num_samples / n];
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot);
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
pred(comp, pivot);
difference_type split = parallel_partition(begin, end, pred, num_threads);
return split;
......@@ -93,7 +103,10 @@ namespace __gnu_parallel
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads)
parallel_sort_qs_conquer(RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -110,24 +123,27 @@ namespace __gnu_parallel
if (n <= 1)
return;
thread_index_t num_processors_left;
thread_index_t num_threads_left;
if ((num_threads % 2) == 1)
num_processors_left = num_threads / 2 + 1;
num_threads_left = num_threads / 2 + 1;
else
num_processors_left = num_threads / 2;
num_threads_left = num_threads / 2;
pivot_rank = n * num_processors_left / num_threads;
pivot_rank = n * num_threads_left / num_threads;
difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
difference_type split = parallel_sort_qs_divide(
begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
#pragma omp parallel sections
{
#pragma omp section
parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left);
parallel_sort_qs_conquer(begin, begin + split,
comp, num_threads_left);
#pragma omp section
parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left);
parallel_sort_qs_conquer(begin + split, end,
comp, num_threads - num_threads_left);
}
}
......@@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads);
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end,
parallel_sort_qs(
RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
int num_threads)
{
_GLIBCXX_CALL(n)
......@@ -165,10 +184,7 @@ Settings::sort_qs_num_samples_preset, num_threads);
// Hard to avoid.
omp_set_num_threads(num_threads);
bool old_nested = (omp_get_nested() != 0);
omp_set_nested(true);
parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
omp_set_nested(old_nested);
}
} //namespace __gnu_parallel
......
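// [Editor's sketch, not part of this patch.]  The recursion shape of
// parallel_sort_qs_conquer above: hand ceil(p/2) threads to the left call and
// the rest to the right, and aim pivot_rank at the matching fraction of the
// range so both teams receive proportional work.  Simplified standalone form
// (middle pivot and std::partition instead of sampling plus parallel_partition):
#include <algorithm>
#include <iterator>
template<typename Iter>
void
quicksort_conquer_sketch(Iter begin, Iter end, int num_threads)
{
typedef typename std::iterator_traits<Iter>::value_type value_type;
if (num_threads <= 1 || end - begin <= 1024)
{
std::sort(begin, end);            // sequential base case
return;
}
int threads_left = num_threads / 2 + num_threads % 2;   // ceil(p / 2)
value_type pivot = *(begin + (end - begin) / 2);
Iter split = std::partition(begin, end,
[&](const value_type& x) { return x < pivot; });
if (split == begin)
{
// Degenerate split (pivot was a minimum): fix one element, keep going.
std::iter_swap(begin, std::min_element(begin, end));
++split;
}
// Nested parallelism (omp_set_nested) must be enabled for the inner teams
// to really run concurrently; without it the sort is still correct.
#pragma omp parallel sections num_threads(2)
{
#pragma omp section
quicksort_conquer_sketch(begin, split, threads_left);
#pragma omp section
quicksort_conquer_sketch(split, end, num_threads - threads_left);
}
}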
......@@ -45,16 +45,16 @@
namespace __gnu_parallel
{
/** @brief Type to hold the index of a bin.
/** @brief Type to hold the index of a bin.
*
* Since many variables of this type are allocated, it should be
* chosen as small as possible.
*/
typedef unsigned short bin_index;
typedef unsigned short bin_index;
/** @brief Data known to every thread participating in
/** @brief Data known to every thread participating in
__gnu_parallel::parallel_random_shuffle(). */
template<typename RandomAccessIterator>
template<typename RandomAccessIterator>
struct DRandomShufflingGlobalData
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
......@@ -90,18 +90,15 @@ namespace __gnu_parallel
: source(_source) { }
};
/** @brief Local data for a thread participating in
/** @brief Local data for a thread participating in
__gnu_parallel::parallel_random_shuffle().
*/
template<typename RandomAccessIterator, typename RandomNumberGenerator>
template<typename RandomAccessIterator, typename RandomNumberGenerator>
struct DRSSorterPU
{
/** @brief Number of threads participating in total. */
int num_threads;
/** @brief Number of owning thread. */
int iam;
/** @brief Begin index for bins taken care of by this thread. */
bin_index bins_begin;
......@@ -115,18 +112,18 @@ namespace __gnu_parallel
DRandomShufflingGlobalData<RandomAccessIterator>* sd;
};
/** @brief Generate a random number in @c [0,2^logp).
/** @brief Generate a random number in @c [0,2^logp).
* @param logp Logarithm (base 2) of the upper range bound.
* @param rng Random number generator to use.
*/
template<typename RandomNumberGenerator>
template<typename RandomNumberGenerator>
inline int
random_number_pow2(int logp, RandomNumberGenerator& rng)
{ return rng.genrand_bits(logp); }
/** @brief Random shuffle code executed by each thread.
/** @brief Random shuffle code executed by each thread.
* @param pus Array of thread-local data records. */
template<typename RandomAccessIterator, typename RandomNumberGenerator>
template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void
parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
RandomNumberGenerator>* pus)
......@@ -135,9 +132,9 @@ namespace __gnu_parallel
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()];
thread_index_t iam = omp_get_thread_num();
DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
thread_index_t iam = d->iam;
// Indexing: dist[bin][processor]
difference_type length = sd->starts[iam + 1] - sd->starts[iam];
......@@ -166,9 +163,9 @@ namespace __gnu_parallel
for (bin_index b = 0; b < sd->num_bins + 1; b++)
sd->dist[b][iam + 1] = dist[b];
#pragma omp barrier
# pragma omp barrier
#pragma omp single
# pragma omp single
{
// Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
// total number of items in bin s
......@@ -178,13 +175,13 @@ namespace __gnu_parallel
sd->dist[s + 1]);
}
#pragma omp barrier
# pragma omp barrier
sequence_index_t offset = 0, global_offset = 0;
for (bin_index s = 0; s < d->bins_begin; s++)
global_offset += sd->dist[s + 1][d->num_threads];
#pragma omp barrier
# pragma omp barrier
for (bin_index s = d->bins_begin; s < d->bins_end; s++)
{
......@@ -193,9 +190,10 @@ namespace __gnu_parallel
offset = sd->dist[s + 1][d->num_threads];
}
sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * offset));
sd->temporaries[iam] = static_cast<value_type*>(
::operator new(sizeof(value_type) * offset));
#pragma omp barrier
# pragma omp barrier
// Draw local copies to avoid false sharing.
for (bin_index b = 0; b < sd->num_bins + 1; b++)
......@@ -223,23 +221,27 @@ namespace __gnu_parallel
delete[] bin_proc;
delete[] temporaries;
#pragma omp barrier
# pragma omp barrier
// Shuffle bins internally.
for (bin_index b = d->bins_begin; b < d->bins_end; b++)
{
value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
* end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
value_type* begin =
sd->temporaries[iam] +
((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
* end =
sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
sequential_random_shuffle(begin, end, rng);
std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
std::copy(begin, end, sd->source + global_offset +
((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
}
delete[] sd->temporaries[iam];
}
/** @brief Round up to the next greater power of 2.
/** @brief Round up to the next greater power of 2.
* @param x Integer to round up */
template<typename T>
template<typename T>
T
round_up_to_pow2(T x)
{
......@@ -249,16 +251,21 @@ namespace __gnu_parallel
return (T)1 << (log2(x - 1) + 1);
}
/** @brief Main parallel random shuffle step.
/** @brief Main parallel random shuffle step.
* @param begin Begin iterator of sequence.
* @param end End iterator of sequence.
* @param n Length of sequence.
* @param num_threads Number of threads to use.
* @param rng Random number generator to use.
*/
template<typename RandomAccessIterator, typename RandomNumberGenerator>
template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void
parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads, RandomNumberGenerator& rng)
parallel_random_shuffle_drs(
RandomAccessIterator begin,
RandomAccessIterator end,
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
thread_index_t num_threads,
RandomNumberGenerator& rng)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -275,16 +282,17 @@ namespace __gnu_parallel
// Try the L1 cache first.
// Must fit into L1.
num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
num_bins_cache = std::max<difference_type>(
1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2
// Power of 2 and at least one element per bin, at most the TLB size.
num_bins = std::min(n, (difference_type)num_bins_cache);
num_bins = std::min<difference_type>(n, num_bins_cache);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
// 2 TLB entries needed per bin.
num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
num_bins = std::min<difference_type>(Settings::TLB_size / 2, num_bins);
#endif
num_bins = round_up_to_pow2(num_bins);
......@@ -293,32 +301,41 @@ namespace __gnu_parallel
#endif
// Now try the L2 cache
// Must fit into L2
num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
1, n / (Settings::L2_cache_size / sizeof(value_type))));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2.
num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
num_bins = static_cast<bin_index>(
std::min(n, static_cast<difference_type>(num_bins_cache)));
// Power of 2 and at least one element per bin, at most the TLB size.
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
// 2 TLB entries needed per bin.
num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
num_bins = std::min(
static_cast<difference_type>(Settings::TLB_size / 2), num_bins);
#endif
num_bins = round_up_to_pow2(num_bins);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
}
#endif
num_threads = std::min((bin_index)num_threads, (bin_index)num_bins);
num_threads = std::min<bin_index>(num_threads, num_bins);
if (num_threads <= 1)
return sequential_random_shuffle(begin, end, rng);
DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
DRSSorterPU<RandomAccessIterator, random_number >* pus;
difference_type* starts;
DRSSorterPU<RandomAccessIterator, random_number >* pus = new DRSSorterPU<RandomAccessIterator, random_number >[num_threads];
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
pus = new DRSSorterPU<RandomAccessIterator, random_number>
[num_threads];
sd.temporaries = new value_type*[num_threads];
//sd.oracles = new bin_index[n];
sd.dist = new difference_type*[num_bins + 1];
sd.bin_proc = new thread_index_t[num_bins];
for (bin_index b = 0; b < num_bins + 1; b++)
......@@ -328,34 +345,36 @@ namespace __gnu_parallel
sd.dist[0][0] = 0;
sd.dist[b][0] = 0;
}
difference_type* starts = sd.starts = new difference_type[num_threads + 1];
starts = sd.starts = new difference_type[num_threads + 1];
int bin_cursor = 0;
sd.num_bins = num_bins;
sd.num_bits = log2(num_bins);
difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0;
int bin_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads;
for (int i = 0; i < num_threads; i++)
difference_type chunk_length = n / num_threads,
split = n % num_threads, start = 0;
difference_type bin_chunk_length = num_bins / num_threads,
bin_split = num_bins % num_threads;
for (thread_index_t i = 0; i < num_threads; i++)
{
starts[i] = start;
start += (i < split) ? (chunk_length + 1) : chunk_length;
int j = pus[i].bins_begin = bin_cursor;
// Range of bins for this processor.
bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length;
bin_cursor += (i < bin_split) ?
(bin_chunk_length + 1) : bin_chunk_length;
pus[i].bins_end = bin_cursor;
for (; j < bin_cursor; j++)
sd.bin_proc[j] = i;
pus[i].num_threads = num_threads;
pus[i].iam = i;
pus[i].seed = rng(std::numeric_limits<uint32>::max());
pus[i].sd = &sd;
}
starts[num_threads] = start;
} //single
// Now shuffle in parallel.
#pragma omp parallel num_threads(num_threads)
parallel_random_shuffle_drs_pu(pus);
}
delete[] starts;
delete[] sd.bin_proc;
......@@ -367,12 +386,12 @@ namespace __gnu_parallel
delete[] pus;
}
/** @brief Sequential cache-efficient random shuffle.
/** @brief Sequential cache-efficient random shuffle.
* @param begin Begin iterator of sequence.
* @param end End iterator of sequence.
* @param rng Random number generator to use.
*/
template<typename RandomAccessIterator, typename RandomNumberGenerator>
template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void
sequential_random_shuffle(RandomAccessIterator begin,
RandomAccessIterator end,
......@@ -388,7 +407,9 @@ namespace __gnu_parallel
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
// Try the L1 cache first, must fit into L1.
num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
num_bins_cache =
std::max<difference_type>
(1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2
......@@ -404,16 +425,20 @@ namespace __gnu_parallel
{
#endif
// Now try the L2 cache, must fit into L2.
num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
num_bins_cache =
static_cast<bin_index>(std::max<difference_type>(
1, n / (Settings::L2_cache_size / sizeof(value_type))));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2
// Power of 2 and at least one element per bin, at most the TLB size.
num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
num_bins = static_cast<bin_index>
(std::min(n, static_cast<difference_type>(num_bins_cache)));
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
// 2 TLB entries needed per bin
num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
num_bins =
std::min<difference_type>(Settings::TLB_size / 2, num_bins);
#endif
num_bins = round_up_to_pow2(num_bins);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
......@@ -424,9 +449,11 @@ namespace __gnu_parallel
if (num_bins > 1)
{
value_type* target = static_cast<value_type*>(::operator new(sizeof(value_type) * n));
value_type* target = static_cast<value_type*>(
::operator new(sizeof(value_type) * n));
bin_index* oracles = new bin_index[n];
difference_type* dist0 = new difference_type[num_bins + 1], * dist1 = new difference_type[num_bins + 1];
difference_type* dist0 = new difference_type[num_bins + 1],
* dist1 = new difference_type[num_bins + 1];
for (int b = 0; b < num_bins + 1; b++)
dist0[b] = 0;
......@@ -454,7 +481,8 @@ namespace __gnu_parallel
for (int b = 0; b < num_bins; b++)
{
sequential_random_shuffle(target + dist1[b], target + dist1[b + 1],
sequential_random_shuffle(target + dist1[b],
target + dist1[b + 1],
rng);
}
......@@ -467,14 +495,15 @@ namespace __gnu_parallel
__gnu_sequential::random_shuffle(begin, end, rng);
}
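// [Editor's sketch, not part of this patch.]  The cache-oriented idea behind
// sequential_random_shuffle above, in compact form: scatter the elements into
// a power-of-two number of random bins sized via the L1/L2 heuristics and
// round_up_to_pow2, then shuffle each bin on its own so every pass works on a
// cache-sized block.  (<random> replaces the library's random_number here.)
#include <algorithm>
#include <random>
#include <vector>
template<typename T>
void
binned_shuffle_sketch(std::vector<T>& v, std::size_t num_bins,
std::mt19937& rng)
{
// num_bins is assumed to be a power of two, as round_up_to_pow2 ensures.
std::vector<std::vector<T> > bins(num_bins);
for (std::size_t i = 0; i < v.size(); ++i)
bins[rng() & (num_bins - 1)].push_back(v[i]);   // uniform random bin
std::size_t pos = 0;
for (std::size_t b = 0; b < num_bins; ++b)
{
std::shuffle(bins[b].begin(), bins[b].end(), rng);
for (std::size_t i = 0; i < bins[b].size(); ++i)
v[pos++] = bins[b][i];
}
}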
/** @brief Parallel random public call.
/** @brief Parallel random public call.
* @param begin Begin iterator of sequence.
* @param end End iterator of sequence.
* @param rng Random number generator to use.
*/
template<typename RandomAccessIterator, typename RandomNumberGenerator>
template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void
parallel_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end,
parallel_random_shuffle(RandomAccessIterator begin,
RandomAccessIterator end,
RandomNumberGenerator rng = random_number())
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
......
......@@ -53,7 +53,7 @@ namespace __gnu_parallel
* @param length Length of sequence to search for.
* @param advances Returned offsets.
*/
template<typename RandomAccessIterator, typename _DifferenceTp>
template<typename RandomAccessIterator, typename _DifferenceTp>
void
calc_borders(RandomAccessIterator elements, _DifferenceTp length,
_DifferenceTp* off)
......@@ -81,7 +81,10 @@ namespace __gnu_parallel
* @param end2 End iterator of second sequence.
* @param pred Find predicate.
* @return Place of finding in first sequences. */
template<typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename Pred>
template<
typename _RandomAccessIterator1,
typename _RandomAccessIterator2,
typename Pred>
_RandomAccessIterator1
search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1,
_RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
......@@ -103,27 +106,34 @@ namespace __gnu_parallel
// Where is first occurrence of pattern? defaults to end.
difference_type result = (end1 - begin1);
difference_type *splitters;
// Pattern too long.
if (input_length < 0)
return end1;
thread_index_t num_threads = std::max<difference_type>(1, std::min<difference_type>(input_length, __gnu_parallel::get_max_threads()));
omp_lock_t result_lock;
omp_init_lock(&result_lock);
difference_type borders[num_threads + 1];
__gnu_parallel::equally_split(input_length, num_threads, borders);
thread_index_t num_threads =
std::max<difference_type>(1,
std::min<difference_type>(input_length, get_max_threads()));
difference_type advances[pattern_length];
calc_borders(begin2, pattern_length, advances);
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
splitters = new difference_type[num_threads + 1];
equally_split(input_length, num_threads, splitters);
}
thread_index_t iam = omp_get_thread_num();
difference_type start = borders[iam], stop = borders[iam + 1];
difference_type start = splitters[iam], stop = splitters[iam + 1];
difference_type pos_in_pattern = 0;
bool found_pattern = false;
......@@ -131,11 +141,12 @@ namespace __gnu_parallel
while (start <= stop && !found_pattern)
{
// Get new value of result.
#pragma omp flush(result)
#pragma omp flush(result)
// No chance for this thread to find first occurrence.
if (result < start)
break;
while (pred(begin1[start + pos_in_pattern], begin2[pos_in_pattern]))
while (pred(begin1[start + pos_in_pattern],
begin2[pos_in_pattern]))
{
++pos_in_pattern;
if (pos_in_pattern == pattern_length)
......@@ -151,12 +162,15 @@ namespace __gnu_parallel
}
// Make safe jump.
start += (pos_in_pattern - advances[pos_in_pattern]);
pos_in_pattern = (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
pos_in_pattern =
(advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
} //parallel
omp_destroy_lock(&result_lock);
delete[] splitters;
// Return iterator on found element.
return (begin1 + result);
}
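// [Editor's sketch, not part of this patch.]  advances[] above is the classic
// KMP border (failure-function) table; calc_borders itself is collapsed in
// this view.  A standard formulation with border[0] = -1 and border[j] equal
// to the length of the longest proper border of the first j pattern characters:
#include <vector>
template<typename Iter>
std::vector<long>
kmp_borders_sketch(Iter pattern, long length)
{
std::vector<long> border(length + 1);
border[0] = -1;
long k = -1;
for (long j = 0; j < length; ++j)
{
while (k >= 0 && !(pattern[k] == pattern[j]))
k = border[k];
border[j + 1] = ++k;
}
return border;
}
// After matching pos characters and then mismatching, the next candidate
// start is start + (pos - border[pos]); the border property guarantees no
// occurrence is skipped, which is what lets each thread above scan its own
// splitter range independently and only synchronize on the shared result.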
......
......@@ -47,7 +47,7 @@
namespace __gnu_parallel
{
template<typename InputIterator, typename OutputIterator>
template<typename InputIterator, typename OutputIterator>
inline OutputIterator
copy_tail(std::pair<InputIterator, InputIterator> b,
std::pair<InputIterator, InputIterator> e, OutputIterator r)
......@@ -68,7 +68,10 @@ namespace __gnu_parallel
return r;
}
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
struct symmetric_difference_func
{
typedef std::iterator_traits<InputIterator> traits_type;
......@@ -107,7 +110,8 @@ namespace __gnu_parallel
}
inline difference_type
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
const
{
difference_type counter = 0;
......@@ -144,7 +148,10 @@ namespace __gnu_parallel
};
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
struct difference_func
{
typedef std::iterator_traits<InputIterator> traits_type;
......@@ -179,7 +186,8 @@ namespace __gnu_parallel
}
inline difference_type
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
const
{
difference_type counter = 0;
......@@ -209,7 +217,10 @@ namespace __gnu_parallel
};
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
struct intersection_func
{
typedef std::iterator_traits<InputIterator> traits_type;
......@@ -243,7 +254,8 @@ namespace __gnu_parallel
}
inline difference_type
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
const
{
difference_type counter = 0;
......@@ -273,10 +285,11 @@ namespace __gnu_parallel
{ return out; }
};
template<class InputIterator, class OutputIterator, class Comparator>
template<class InputIterator, class OutputIterator, class Comparator>
struct union_func
{
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
typedef typename std::iterator_traits<InputIterator>::difference_type
difference_type;
union_func(Comparator c) : comp(c) {}
......@@ -310,8 +323,8 @@ namespace __gnu_parallel
}
inline difference_type
count(InputIterator a, const InputIterator b, InputIterator c,
const InputIterator d) const
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
const
{
difference_type counter = 0;
......@@ -343,7 +356,10 @@ namespace __gnu_parallel
{ return std::copy(a, b, out); }
};
template<typename InputIterator, typename OutputIterator, typename Operation>
template<
typename InputIterator,
typename OutputIterator,
typename Operation>
OutputIterator
parallel_set_operation(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
......@@ -355,7 +371,6 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
typedef typename std::pair<InputIterator, InputIterator> iterator_pair;
if (begin1 == end1)
return op.first_empty(begin2, end2, result);
......@@ -364,27 +379,35 @@ namespace __gnu_parallel
const difference_type size = (end1 - begin1) + (end2 - begin2);
thread_index_t num_threads = std::min<difference_type>(std::min(end1 - begin1, end2 - begin2), get_max_threads());
difference_type borders[num_threads + 2];
equally_split(size, num_threads + 1, borders);
const iterator_pair sequence[ 2 ] =
{ std::make_pair(begin1, end1), std::make_pair(begin2, end2) } ;
OutputIterator return_value = result;
difference_type *borders;
iterator_pair *block_begins;
difference_type* lengths;
const iterator_pair sequence[ 2 ] = { std::make_pair(begin1, end1), std::make_pair(begin2, end2) } ;
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(),
std::min(end1 - begin1, end2 - begin2));
iterator_pair block_begins[num_threads + 1];
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
equally_split(size, num_threads + 1, borders);
block_begins = new iterator_pair[num_threads + 1];
// Very start.
block_begins[0] = std::make_pair(begin1, begin2);
difference_type length[num_threads];
lengths = new difference_type[num_threads];
} //single
OutputIterator return_value = result;
thread_index_t iam = omp_get_thread_num();
#pragma omp parallel num_threads(num_threads)
{
// Result from multiseq_partition.
InputIterator offset[2];
const int iam = omp_get_thread_num();
const difference_type rank = borders[iam + 1];
multiseq_partition(sequence, sequence + 2, rank, offset, op.comp);
......@@ -401,10 +424,11 @@ namespace __gnu_parallel
--offset[ 0 ];
}
iterator_pair block_end = block_begins[ iam + 1 ] = iterator_pair(offset[ 0 ], offset[ 1 ]);
iterator_pair block_end = block_begins[ iam + 1 ] =
iterator_pair(offset[ 0 ], offset[ 1 ]);
// Make sure all threads have their block_begin result written out.
#pragma omp barrier
# pragma omp barrier
iterator_pair block_begin = block_begins[ iam ];
......@@ -413,16 +437,19 @@ namespace __gnu_parallel
if (iam == 0)
{
// The first thread can copy already.
length[ iam ] = op.invoke(block_begin.first, block_end.first, block_begin.second, block_end.second, result) - result;
lengths[ iam ] = op.invoke(block_begin.first, block_end.first,
block_begin.second, block_end.second,
result)
- result;
}
else
{
length[ iam ] = op.count(block_begin.first, block_end.first,
lengths[ iam ] = op.count(block_begin.first, block_end.first,
block_begin.second, block_end.second);
}
// Make sure everyone wrote their lengths.
#pragma omp barrier
# pragma omp barrier
OutputIterator r = result;
......@@ -430,18 +457,19 @@ namespace __gnu_parallel
{
// Do the last block.
for (int i = 0; i < num_threads; ++i)
r += length[i];
r += lengths[i];
block_begin = block_begins[num_threads];
// Return the result iterator of the last block.
return_value = op.invoke(block_begin.first, end1, block_begin.second, end2, r);
return_value = op.invoke(
block_begin.first, end1, block_begin.second, end2, r);
}
else
{
for (int i = 0; i < iam; ++i)
r += length[ i ];
r += lengths[ i ];
// Reset begins for copy pass.
op.invoke(block_begin.first, block_end.first,
......@@ -452,7 +480,10 @@ namespace __gnu_parallel
}
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
OutputIterator
parallel_set_union(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
......@@ -462,7 +493,10 @@ namespace __gnu_parallel
union_func< InputIterator, OutputIterator, Comparator>(comp));
}
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
OutputIterator
parallel_set_intersection(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
......@@ -473,9 +507,11 @@ namespace __gnu_parallel
}
template<typename InputIterator, typename OutputIterator>
template<typename InputIterator, typename OutputIterator>
OutputIterator
set_intersection(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result)
set_intersection(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
OutputIterator result)
{
typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type;
......@@ -484,7 +520,10 @@ namespace __gnu_parallel
std::less<value_type>());
}
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
OutputIterator
parallel_set_difference(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
......@@ -494,22 +533,20 @@ namespace __gnu_parallel
difference_func<InputIterator, OutputIterator, Comparator>(comp));
}
template<typename InputIterator, typename OutputIterator, typename Comparator>
template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
OutputIterator
parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result, Comparator comp)
parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
OutputIterator result, Comparator comp)
{
return parallel_set_operation(begin1, end1, begin2, end2, result,
symmetric_difference_func<InputIterator, OutputIterator, Comparator>(comp));
symmetric_difference_func<InputIterator, OutputIterator, Comparator>
(comp));
}
}
#endif // _GLIBCXX_SET_ALGORITHM_
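// [Editor's sketch, not part of this patch.]  The output-placement scheme used
// by parallel_set_operation above (and by parallel_unique_copy below): every
// thread first only counts its block's output length, a prefix sum over those
// lengths yields each block's write offset in result, and a second pass does
// the actual copying.  Shown here for a simple filter; parallel_filter_sketch
// and its parameters are invented names.
#include <omp.h>
#include <vector>
template<typename T, typename Pred>
std::vector<T>
parallel_filter_sketch(const std::vector<T>& in, Pred pred)
{
const long n = static_cast<long>(in.size());
int num_threads = 1;
std::vector<long> lengths;
std::vector<T> out;
#pragma omp parallel
{
#pragma omp single
{
num_threads = omp_get_num_threads();
lengths.assign(num_threads + 1, 0);  // lengths[i + 1]: block i's output
}
int iam = omp_get_thread_num();
long start = static_cast<long>(iam) * n / num_threads;
long stop = static_cast<long>(iam + 1) * n / num_threads;
// Pass 1: count only.
for (long i = start; i < stop; ++i)
if (pred(in[i]))
++lengths[iam + 1];
#pragma omp barrier
#pragma omp single
{
for (int t = 0; t < num_threads; ++t)  // prefix sum of block lengths
lengths[t + 1] += lengths[t];
out.resize(lengths[num_threads]);
}
// Pass 2: each thread writes its block at its own offset.
long pos = lengths[iam];
for (long i = start; i < stop; ++i)
if (pred(in[i]))
out[pos++] = in[i];
} //parallel
return out;
}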
......@@ -44,13 +44,16 @@
namespace __gnu_parallel
{
/** @brief Parallel std::unique_copy(), without explicit equality predicate.
/** @brief Parallel std::unique_copy(), with explicit equality predicate.
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @param binary_pred Equality predicate.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator, class BinaryPredicate>
template<
typename InputIterator,
class OutputIterator,
class BinaryPredicate>
inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result, BinaryPredicate binary_pred)
......@@ -62,20 +65,27 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
difference_type size = last - first;
int num_threads = __gnu_parallel::get_max_threads();
difference_type counter[num_threads + 1];
if (size == 0)
return result;
// Let the first thread process two parts.
difference_type borders[num_threads + 2];
__gnu_parallel::equally_split(size, num_threads + 1, borders);
difference_type *counter;
difference_type *borders;
thread_index_t num_threads = get_max_threads();
// First part contains at least one element.
#pragma omp parallel num_threads(num_threads)
# pragma omp parallel num_threads(num_threads)
{
int iam = omp_get_thread_num();
# pragma omp single
{
num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
equally_split(size, num_threads + 1, borders);
counter = new difference_type[num_threads + 1];
}
thread_index_t iam = omp_get_thread_num();
difference_type begin, end;
......@@ -83,6 +93,7 @@ namespace __gnu_parallel
// Needed for position in output
difference_type i = 0;
OutputIterator out = result;
if (iam == 0)
{
begin = borders[0] + 1; // == 1
......@@ -120,7 +131,7 @@ namespace __gnu_parallel
// Last part still untouched.
difference_type begin_output;
#pragma omp barrier
# pragma omp barrier
// Store result in output on calculated positions.
begin_output = 0;
......@@ -170,15 +181,17 @@ namespace __gnu_parallel
for (int t = 0; t < num_threads + 1; t++)
end_output += counter[t];
delete[] borders;
return result + end_output;
}
/** @brief Parallel std::unique_copy(), without explicit equality predicate
/** @brief Parallel std::unique_copy(), without explicit equality predicate
* @param first Begin iterator of input sequence.
* @param last End iterator of input sequence.
* @param result Begin iterator of result sequence.
* @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator>
template<typename InputIterator, class OutputIterator>
inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result)
......
......@@ -55,8 +55,8 @@ namespace __gnu_parallel
#define _GLIBCXX_JOB_VOLATILE volatile
/** @brief One job for a certain thread. */
template<typename _DifferenceTp>
/** @brief One job for a certain thread. */
template<typename _DifferenceTp>
struct Job
{
typedef _DifferenceTp difference_type;
......@@ -78,7 +78,7 @@ namespace __gnu_parallel
_GLIBCXX_JOB_VOLATILE difference_type load;
};
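// [Editor's sketch, not part of this patch.]  The reservation arithmetic the
// work-stealing loop below applies to Job::first and Job::last, written with
// std::atomic instead of the library's fetch_and_add on volatile fields:
#include <atomic>
struct JobSketch
{
std::atomic<long> first;   // next unreserved index, only ever grows
long last;                 // last index belonging to this job
};
// Owner: reserve the next chunk of its own job; the caller then processes
// [reserved, min(reserved + chunk_size - 1, job.last)].
inline long
reserve_chunk(JobSketch& job, long chunk_size)
{ return job.first.fetch_add(chunk_size); }
// Thief: steal about half of the victim's remaining range by pushing the
// victim's first forward, exactly as the owner would.
inline bool
steal_half(JobSketch& victim, long& stolen_first, long& stolen_last)
{
long remaining = victim.last - victim.first.load() + 1;
if (remaining <= 0)
return false;                            // nothing left to take
long steal = (remaining < 2) ? 1 : remaining / 2;
stolen_first = victim.first.fetch_add(steal);
stolen_last = stolen_first + steal - 1;
if (stolen_last > victim.last)             // raced with the owner: clip
stolen_last = victim.last;
return stolen_first <= stolen_last;
}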
/** @brief Work stealing algorithm for random access iterators.
/** @brief Work stealing algorithm for random access iterators.
*
* Uses O(1) additional memory. Synchronization at job lists is
* done with atomic operations.
......@@ -96,13 +96,20 @@ namespace __gnu_parallel
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op
for_each_template_random_access_workstealing(RandomAccessIterator begin,
for_each_template_random_access_workstealing(
RandomAccessIterator begin,
RandomAccessIterator end,
Op op, Fu& f, Red r,
Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
typename std::iterator_traits<RandomAccessIterator>::difference_type
bound)
{
_GLIBCXX_CALL(end - begin)
......@@ -110,34 +117,43 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
difference_type chunk_size = static_cast<difference_type>(Settings::workstealing_chunk_size);
difference_type chunk_size =
static_cast<difference_type>(Settings::workstealing_chunk_size);
// How many jobs?
difference_type length = (bound < 0) ? (end - begin) : bound;
// To avoid false sharing in a cache line.
const int stride = Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
const int stride =
Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
// Total number of threads currently working.
thread_index_t busy = 0;
thread_index_t num_threads = get_max_threads();
difference_type num_threads_min = num_threads < end - begin ? num_threads : end - begin;
Job<difference_type> *job;
omp_lock_t output_lock;
omp_init_lock(&output_lock);
// No more threads than jobs, at least one thread.
difference_type num_threads_max = num_threads_min > 1 ? num_threads_min : 1;
num_threads = static_cast<thread_index_t>(num_threads_max);
// Create job description array.
Job<difference_type> *job = new Job<difference_type>[num_threads * stride];
// Write base value to output.
output = base;
#pragma omp parallel shared(busy) num_threads(num_threads)
// No more threads than jobs, at least one thread.
thread_index_t num_threads =
__gnu_parallel::max<thread_index_t>(1,
__gnu_parallel::min<difference_type>(length, get_max_threads()));
# pragma omp parallel shared(busy) num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
// Create job description array.
job = new Job<difference_type>[num_threads * stride];
}
// Initialization phase.
// Flags for every thread if it is doing productive work.
......@@ -158,19 +174,22 @@ namespace __gnu_parallel
// Number of elements to steal in one attempt.
difference_type steal;
// Every thread has its own random number generator (modulo num_threads).
// Every thread has its own random number generator
// (modulo num_threads).
random_number rand_gen(iam, num_threads);
#pragma omp atomic
// This thread is currently working.
# pragma omp atomic
busy++;
iam_working = true;
// How many jobs per thread? Last thread gets the rest.
my_job.first = static_cast<difference_type>(iam * (length / num_threads));
my_job.first =
static_cast<difference_type>(iam * (length / num_threads));
my_job.last = (iam == (num_threads - 1)) ? (length - 1) : ((iam + 1) * (length / num_threads) - 1);
my_job.last = (iam == (num_threads - 1)) ?
(length - 1) : ((iam + 1) * (length / num_threads) - 1);
my_job.load = my_job.last - my_job.first + 1;
// Init result with first value (to have a base value for reduction).
......@@ -185,26 +204,29 @@ namespace __gnu_parallel
RandomAccessIterator current;
#pragma omp barrier
# pragma omp barrier
// Actual work phase
// Work on own or stolen start
while (busy > 0)
{
// Work until no productive thread left.
#pragma omp flush(busy)
# pragma omp flush(busy)
// Thread has own work to do
while (my_job.first <= my_job.last)
{
// fetch-and-add call
// Reserve current job block (size chunk_size) in my queue.
difference_type current_job = fetch_and_add<difference_type>(&(my_job.first), chunk_size);
difference_type current_job =
fetch_and_add<difference_type>(&(my_job.first), chunk_size);
// Update load, to make the three values consistent,
// first might have been changed in the meantime
my_job.load = my_job.last - my_job.first + 1;
for (difference_type job_counter = 0; job_counter < chunk_size && current_job <= my_job.last; job_counter++)
for (difference_type job_counter = 0;
job_counter < chunk_size && current_job <= my_job.last;
job_counter++)
{
// Yes: process it!
current = begin + current_job;
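
Each thread reserves work from its own queue in chunks: fetch_and_add moves my_job.first forward atomically and returns the old value, which is the first index of the chunk just claimed. The same idiom in isolation, with std::atomic standing in for __gnu_parallel::fetch_and_add (illustrative only):

#include <atomic>
#include <cstdio>

int main()
{
  const long length = 1000, chunk_size = 64;
  const long last = length - 1;
  std::atomic<long> first(0);     // next unclaimed index of this thread's range

  long claimed;
  // fetch_add returns the previous value: the start of the chunk we now own.
  while ((claimed = first.fetch_add(chunk_size)) <= last)
    {
      long chunk_last = claimed + chunk_size - 1;
      if (chunk_last > last)
        chunk_last = last;        // the final chunk may be short
      std::printf("processing [%ld, %ld]\n", claimed, chunk_last);
    }
  return 0;
}
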
......@@ -214,15 +236,14 @@ namespace __gnu_parallel
result = r(result, f(op, current));
}
#pragma omp flush(busy)
# pragma omp flush(busy)
}
// After reaching this point, a thread's job list is empty.
if (iam_working)
{
#pragma omp atomic
// This thread no longer has work.
# pragma omp atomic
busy--;
iam_working = false;
......@@ -231,16 +252,17 @@ namespace __gnu_parallel
difference_type supposed_first, supposed_last, supposed_load;
do
{
// Find random nonempty deque (not own) and do consistency check.
// Find random nonempty deque (not own), do consistency check.
yield();
#pragma omp flush(busy)
# pragma omp flush(busy)
victim = rand_gen();
supposed_first = job[victim * stride].first;
supposed_last = job[victim * stride].last;
supposed_load = job[victim * stride].load;
}
while (busy > 0
&& ((supposed_load <= 0) || ((supposed_first + supposed_load - 1) != supposed_last)));
&& ((supposed_load <= 0)
|| ((supposed_first + supposed_load - 1) != supposed_last)));
if (busy == 0)
break;
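
The retry loop above reads a victim's three fields without taking a lock and only trusts the snapshot when they agree with each other (it also gives up once no thread is busy). Written out as a standalone predicate, purely for illustration:

// Illustrative only: the snapshot is usable exactly when the retry
// condition above no longer holds for it.
inline bool
snapshot_consistent(long first, long last, long load)
{ return load > 0 && first + load - 1 == last; }
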
......@@ -251,40 +273,30 @@ namespace __gnu_parallel
// Number of elements to steal (at least one).
steal = (supposed_load < 2) ? 1 : supposed_load / 2;
// Protects against stealing threads
// omp_set_lock(&(job[victim * stride].lock));
// Push victim's start forward.
difference_type stolen_first = fetch_and_add<difference_type>(&(job[victim * stride].first), steal);
difference_type stolen_try = stolen_first + steal - difference_type(1);
// Protects against working thread
// omp_unset_lock(&(job[victim * stride].lock));
difference_type stolen_first =
fetch_and_add<difference_type>(
&(job[victim * stride].first), steal);
difference_type stolen_try =
stolen_first + steal - difference_type(1);
my_job.first = stolen_first;
// Avoid std::min dependencies.
my_job.last = stolen_try < supposed_last ? stolen_try : supposed_last;
my_job.last = __gnu_parallel::min(stolen_try, supposed_last);
my_job.load = my_job.last - my_job.first + 1;
//omp_unset_lock(&(my_job.lock));
#pragma omp atomic
// Has potential work again.
# pragma omp atomic
busy++;
iam_working = true;
#pragma omp flush(busy)
# pragma omp flush(busy)
}
#pragma omp flush(busy)
# pragma omp flush(busy)
} // end while busy > 0
// Add accumulated result to output.
omp_set_lock(&output_lock);
output = r(output, result);
omp_unset_lock(&output_lock);
//omp_destroy_lock(&(my_job.lock));
}
delete[] job;
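
The stealing step above takes half of the victim's remaining load (at least one element) by pushing the victim's first index forward with the same fetch_and_add primitive and clamping the stolen range to the victim's last index. A sketch of that step in isolation, with std::atomic as a stand-in and invented names:

#include <algorithm>
#include <atomic>
#include <cstdio>

// A victim's job description reduced to the two fields the thief touches.
struct Job
{
  std::atomic<long> first;
  long last;
  Job(long f, long l) : first(f), last(l) { }
};

int main()
{
  Job victim(100, 199);                            // victim still owns [100, 199]

  long load  = victim.last - victim.first + 1;
  long steal = (load < 2) ? 1 : load / 2;          // take half, at least one

  // Push the victim's start forward; the value returned by fetch_add is the
  // first index of the stolen range.
  long stolen_first = victim.first.fetch_add(steal);
  long stolen_last  = std::min(stolen_first + steal - 1, victim.last);

  std::printf("stole [%ld, %ld]\n", stolen_first, stolen_last);
  return 0;
}
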
......