diff --git a/CMakeLists.txt b/CMakeLists.txt
index 86bd6f78cf4ba6d419ac0242d088aa54cca6a8cc..dfec55acac34c674feffdd5d7dad7e3768bd12db 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,12 +31,25 @@ if(WITH_TRACE)
     add_definitions("-DWITH_TRACE")
 endif()
 
-# TBB support
-set(WITH_TBB OFF CACHE BOOL "use TBB for on-node threading" )
-if(WITH_TBB)
+#threading model selection
+set(THREADING_MODEL "serial" CACHE STRING "set the threading model, one of serial/tbb/omp")
+if(THREADING_MODEL MATCHES "tbb")
+    # TBB support
     find_package(TBB REQUIRED)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TBB_DEFINITIONS}")
     add_definitions(-DWITH_TBB)
+
+elseif(THREADING_MODEL MATCHES "omp")
+    # OpenMP support
+    find_package(OpenMP REQUIRED)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    add_definitions(-DWITH_OMP)
+
+elseif(THREADING_MODEL MATCHES "serial")
+    #setup previously done
+   
+else()
+    message( FATAL_ERROR "-- Threading model '${THREADING_MODEL}' not supported, use one of serial/tbb/omp")
 endif()
 
 # MPI support
@@ -50,12 +63,15 @@ if(WITH_MPI)
     set_property(DIRECTORY APPEND_STRING PROPERTY COMPILE_OPTIONS "${MPI_C_COMPILE_FLAGS}")
 endif()
 
-# Profiler support
+
+# Internal profiler support
 set(WITH_PROFILING OFF CACHE BOOL "use built-in profiling of miniapp" )
 if(WITH_PROFILING)
     add_definitions(-DWITH_PROFILING)
 endif()
 
+
+
 # Cray systems
 set(SYSTEM_CRAY OFF CACHE BOOL "add flags for compilation on Cray systems")
 if(SYSTEM_CRAY)
diff --git a/scripts/extrae/extrae.xml b/scripts/extrae/extrae.xml
new file mode 100644
index 0000000000000000000000000000000000000000..fb3e731198810371fc48bb006c87012fe84df14e
--- /dev/null
+++ b/scripts/extrae/extrae.xml
@@ -0,0 +1,81 @@
+<?xml version='1.0'?>
+
+<trace enabled="yes"
+ home="/apps/CEPBATOOLS/extrae/3.3.0/openmpi+libgomp4.2/64"
+ initial-mode="detail"
+ type="paraver"
+ xml-parser-id="Id: xml-parse.c 3918 2016-03-11 14:59:01Z harald $"
+>
+
+  <openmp enabled="yes">
+    <locks enabled="no" />
+    <counters enabled="yes" />
+  </openmp>
+
+  <pthread enabled="no">
+    <locks enabled="no" />
+    <counters enabled="yes" />
+  </pthread>
+
+  <counters enabled="yes">
+    <cpu enabled="yes" starting-set-distribution="1">
+      <set enabled="yes" domain="all" changeat-time="0">
+        PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_L1_DCM,PAPI_L2_DCM,PAPI_L3_TCM,PAPI_FP_INS,PAPI_BR_MSP
+      </set>
+      <set enabled="yes" domain="all" changeat-time="0">
+        PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_LD_INS,PAPI_SR_INS,PAPI_BR_UCN,PAPI_BR_CN,PAPI_VEC_SP,RESOURCE_STALLS
+        <sampling enabled="no" period="1000000000">PAPI_TOT_CYC</sampling>
+      </set>
+    </cpu>
+
+    <network enabled="no" />
+
+    <resource-usage enabled="no" />
+
+    <memory-usage enabled="no" />
+  </counters>
+
+  <storage enabled="no">
+    <trace-prefix enabled="yes">TRACE</trace-prefix>
+    <size enabled="no">5</size>
+    <temporal-directory enabled="yes">/scratch</temporal-directory>
+    <final-directory enabled="yes">/gpfs/scratch/bsc41/bsc41273</final-directory>
+  </storage>
+
+  <buffer enabled="yes">
+    <size enabled="yes">500000</size>
+    <circular enabled="no" />
+  </buffer>
+
+  <trace-control enabled="yes">
+    <file enabled="no" frequency="5M">/gpfs/scratch/bsc41/bsc41273/control</file>
+    <global-ops enabled="no"></global-ops>
+  </trace-control>
+
+  <others enabled="yes">
+    <minimum-time enabled="no">10M</minimum-time>
+    <finalize-on-signal enabled="yes" 
+      SIGUSR1="no" SIGUSR2="no" SIGINT="yes"
+      SIGQUIT="yes" SIGTERM="yes" SIGXCPU="yes"
+      SIGFPE="yes" SIGSEGV="yes" SIGABRT="yes"
+    />
+    <flush-sampling-buffer-at-instrumentation-point enabled="yes" />
+  </others>
+
+  <sampling enabled="no" type="virtual" period="50m" variability="10m" />
+
+  <dynamic-memory enabled="no" />
+
+  <input-output enabled="no" />
+
+  <merge enabled="yes" 
+    synchronization="default"
+    tree-fan-out="16"
+    max-memory="512"
+    joint-states="yes"
+    keep-mpits="yes"
+    sort-addresses="yes"
+    overwrite="yes"
+  />
+
+</trace>
diff --git a/scripts/extrae/run_instrumented_omp.sh b/scripts/extrae/run_instrumented_omp.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7e0601a11f1d3463c4c62064a940c0874862a0da
--- /dev/null
+++ b/scripts/extrae/run_instrumented_omp.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+display_usage() { 
+	echo -e "\nUsage:\n$0 [num_threads] \n" 
+} 
+
+# if less than one arguments supplied, display usage 
+if [  $# -le 0 ] 
+	then 
+		display_usage
+		exit 1
+fi 
+ 
+# check whether user had supplied -h or --help . If yes display usage 
+if [[ ( $# == "--help") ||  $# == "-h" ]] 
+	then 
+		display_usage
+		exit 0
+fi
+
+if [ -z ${EXTRAE_HOME+x} ] 
+	then 
+		echo -e "\nSpecify EXTRAE_HOME variable before executing this script\n"
+		echo "	export EXTRAE_HOME=path/to/directory"
+		exit 1
+	else 
+		echo -e "\nEXTRAE_HOME is set to '$EXTRAE_HOME'"
+fi
+
+export OMP_NUM_THREADS=$1
+export EXTRAE_CONFIG_FILE=extrae.xml
+source ${EXTRAE_HOME}/etc/extrae.sh
+export LD_PRELOAD=$(find $EXTRAE_HOME -name "libomptrace.so")
+
+
+./../../build/miniapp/miniapp.exe 
+
+unset LD_PRELOAD
\ No newline at end of file
diff --git a/scripts/extrae/run_instrumented_serial.sh b/scripts/extrae/run_instrumented_serial.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f8787e8d7871ff9d879239490be7ce1e174d6afd
--- /dev/null
+++ b/scripts/extrae/run_instrumented_serial.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+if [ -z ${EXTRAE_HOME+x} ] 
+	then 
+		echo -e "\nSpecify EXTRAE_HOME variable before executing this script\n"
+		echo "	export EXTRAE_HOME=path/to/directory"
+		exit 1
+	else 
+		echo -e "\nEXTRAE_HOME is set to '$EXTRAE_HOME'"
+fi
+
+
+export EXTRAE_CONFIG_FILE=extrae.xml
+source ${EXTRAE_HOME}/etc/extrae.sh
+export LD_PRELOAD=$(find $EXTRAE_HOME -name "libseqtrace.so")
+
+./../../build/miniapp/miniapp.exe 
+
+unset LD_PRELOAD
\ No newline at end of file
diff --git a/scripts/extrae/run_once.sh b/scripts/extrae/run_once.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b863841883a96f34efc44eb9c3d998955c14b563
--- /dev/null
+++ b/scripts/extrae/run_once.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# @ job_name = miniapp
+# @ partition = debug
+## @ reservation = 
+# @ initialdir = .
+# @ output = miniapp_%j.out
+# @ error = miniapp_%j.err
+# @ total_tasks = 1
+# @ cpus_per_task = 12
+# @ node_usage = not_shared
+# @ wall_clock_limit = 00:15:00
+
+export OMP_NUM_THREADS=16
+./run_instrumented.sh 
+
diff --git a/src/connection.hpp b/src/connection.hpp
index b9e40fb5baeb83ad69c87c219f65d9f98acc4456..361aee20646113fac75bcde3139fad1026e427a9 100644
--- a/src/connection.hpp
+++ b/src/connection.hpp
@@ -15,6 +15,8 @@ public:
     using id_type = cell_member_type;
     using time_type = Time;
 
+    connection()=default;
+
     connection(id_type src, id_type dest, float w, time_type d) :
         source_(src),
         destination_(dest),
diff --git a/src/threading/omp.hpp b/src/threading/omp.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad43e82b89f0ed8fd63a731da71cefe6c22c2c0e
--- /dev/null
+++ b/src/threading/omp.hpp
@@ -0,0 +1,163 @@
+#pragma once
+
+#if !defined(WITH_OMP)
+    #error "this header can only be loaded if WITH_OMP is set"
+#endif
+
+#include <omp.h>
+#include "parallel_stable_sort.h"
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <string>
+#include <vector>
+
+namespace nest {
+namespace mc {
+namespace threading {
+
+///////////////////////////////////////////////////////////////////////
+// types
+///////////////////////////////////////////////////////////////////////
+template <typename T>
+class enumerable_thread_specific {
+    using storage_class = std::vector<T>;
+    storage_class data;
+public :
+    using iterator = typename storage_class::iterator;
+    using const_iterator = typename storage_class::const_iterator;
+
+    enumerable_thread_specific() {
+        data = std::vector<T>(omp_get_max_threads());
+    }
+
+    enumerable_thread_specific(const T& init) {
+        data = std::vector<T>(omp_get_max_threads(), init);
+    }
+
+    T& local() { return data[omp_get_thread_num()]; }
+    const T& local() const { return data[omp_get_thread_num()]; }
+
+    auto size() -> decltype(data.size()) const { return data.size(); }
+
+    iterator begin() { return data.begin(); }
+    iterator end()   { return data.end(); }
+
+    const_iterator begin() const { return data.begin(); }
+    const_iterator end()   const { return data.end(); }
+
+    const_iterator cbegin() const { return data.cbegin(); }
+    const_iterator cend()   const { return data.cend(); }
+};
+
+///////////////////////////////////////////////////////////////////////
+// algorithms
+///////////////////////////////////////////////////////////////////////
+struct parallel_for {
+    template <typename F>
+    static void apply(int left, int right, F f) {
+        #pragma omp parallel for
+        for(int i=left; i<right; ++i) {
+            f(i);
+        }
+    }
+};
+
+template <typename RandomIt>
+void sort(RandomIt begin, RandomIt end) {
+    pss::parallel_stable_sort(begin, end);
+}
+
+template <typename RandomIt, typename Compare>
+void sort(RandomIt begin, RandomIt end, Compare comp) {
+    pss::parallel_stable_sort(begin, end ,comp);
+}
+
+template <typename Container>
+void sort(Container& c) {
+    pss::parallel_stable_sort(c.begin(), c.end());
+}
+
+
+template <typename T>
+class parallel_vector {
+    using value_type = T;
+    std::vector<value_type> data_;
+public:
+    parallel_vector() = default;
+    using iterator = typename std::vector<value_type>::iterator;
+    using const_iterator = typename std::vector<value_type>::const_iterator;
+
+    iterator begin() { return data_.begin(); }
+    iterator end()   { return data_.end(); }
+
+    const_iterator begin() const { return data_.begin(); }
+    const_iterator end()   const { return data_.end(); }
+
+    const_iterator cbegin() const { return data_.cbegin(); }
+    const_iterator cend()   const { return data_.cend(); }
+
+    void push_back (const value_type& val) {
+        #pragma omp critical
+        data_.push_back(val);
+    }
+
+    void push_back (value_type&& val) {
+        #pragma omp critical
+        data_.push_back(std::move(val));
+    }
+};
+
+inline std::string description() {
+    return "OpenMP";
+}
+
+struct timer {
+    using time_point = std::chrono::time_point<std::chrono::system_clock>;
+
+    static inline time_point tic() {
+        return std::chrono::system_clock::now();
+    }
+
+    static inline double toc(time_point t) {
+        return std::chrono::duration<double>(tic() - t).count();
+    }
+
+    static inline double difference(time_point b, time_point e) {
+        return std::chrono::duration<double>(e-b).count();
+    }
+};
+
+constexpr bool multithreaded() { return true; }
+
+
+class task_group {
+public:
+    task_group() = default;
+
+    template<typename Func>
+    void run(const Func& f) {
+        f();
+    }
+
+    template<typename Func>
+    void run_and_wait(const Func& f) {
+        f();
+    }
+
+    void wait()
+    {}
+
+    bool is_canceling() {
+        return false;
+    }
+
+    void cancel()
+    {}
+};
+
+} // threading
+} // mc
+} // nest
+
diff --git a/src/threading/parallel_stable_sort.h b/src/threading/parallel_stable_sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..caf010dd25c7fa08fb3670250d1062fac2f92c85
--- /dev/null
+++ b/src/threading/parallel_stable_sort.h
@@ -0,0 +1,127 @@
+/*
+  Copyright (C) 2014 Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+  * Neither the name of Intel Corporation nor the names of its
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <algorithm>
+#include <omp.h>
+
+#include "pss_common.h"
+
+namespace pss {
+
+namespace internal {
+
+// Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,zs+(xe-xs)+(ye-ys))
+// Destroy input sequence iff destroy==true
+template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
+#if __INTEL_COMPILER<=1500
+// Work around bug where firstprivate applied to formal parameter does not work.
+void parallel_move_merge( RandomAccessIterator1 xs_, RandomAccessIterator1 xe, RandomAccessIterator2 ys_, RandomAccessIterator2 ye, RandomAccessIterator3 zs_, bool destroy, Compare comp ) {
+    RandomAccessIterator1 xs = xs_;
+    RandomAccessIterator2 ys = ys_;
+    RandomAccessIterator3 zs = zs_;
+#else
+void parallel_move_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp ) {
+#endif
+    const int MERGE_CUT_OFF = 2000;
+    while( (xe-xs) + (ye-ys) > MERGE_CUT_OFF ) {
+        RandomAccessIterator1 xm;
+        RandomAccessIterator2 ym;
+        if( xe-xs < ye-ys  ) {
+            ym = ys+(ye-ys)/2;
+            xm = std::upper_bound(xs,xe,*ym,comp);
+        } else {
+            xm = xs+(xe-xs)/2;
+            ym = std::lower_bound(ys,ye,*xm,comp);
+        }
+#pragma omp task untied mergeable firstprivate(xs,xm,ys,ym,zs,destroy,comp)
+        parallel_move_merge( xs, xm, ys, ym, zs, destroy, comp );
+        zs += (xm-xs) + (ym-ys);
+        xs = xm;
+        ys = ym;
+    }
+    serial_move_merge( xs, xe, ys, ye, zs, comp );
+    if( destroy ) {
+        serial_destroy( xs, xe );
+        serial_destroy( ys, ye );
+    }
+#pragma omp taskwait
+}
+
+// Sorts [xs,xe), where zs[0:xe-xs) is temporary buffer supplied by caller.
+// Result is in [xs,xe) if inplace==true, otherwise in [zs,zs+(xe-xs))
+template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
+void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp ) {
+    //typedef typename std::iterator_traits<RandomAccessIterator2>::value_type T;
+    const int SORT_CUT_OFF = 500;
+    if( xe-xs<=SORT_CUT_OFF ) {
+        stable_sort_base_case(xs, xe, zs, inplace, comp); 
+    } else {
+        RandomAccessIterator1 xm = xs + (xe-xs)/2;
+        RandomAccessIterator2 zm = zs + (xm-xs);
+        RandomAccessIterator2 ze = zs + (xe-xs);
+#pragma omp task
+        parallel_stable_sort_aux( xs, xm, zs, !inplace, comp );
+        parallel_stable_sort_aux( xm, xe, zm, !inplace, comp );
+#pragma omp taskwait
+        if( inplace )
+            parallel_move_merge( zs, zm, zm, ze, xs, inplace==2, comp );
+        else
+            parallel_move_merge( xs, xm, xm, xe, zs, false, comp );
+   }
+}
+
+} // namespace internal
+
+
+
+template<typename RandomAccessIterator, typename Compare>
+void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe, Compare comp ) {
+    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
+    if( internal::raw_buffer z = internal::raw_buffer( sizeof(T)*(xe-xs) ) )
+        if( omp_get_num_threads() > 1 ) 
+            internal::parallel_stable_sort_aux( xs, xe, (T*)z.get(), 2, comp );
+        else
+            #pragma omp parallel
+            #pragma omp master
+           internal::parallel_stable_sort_aux( xs, xe, (T*)z.get(), 2, comp );  
+    else
+        // Not enough memory available - fall back on serial sort
+        std::stable_sort( xs, xe, comp );
+}
+
+template<class RandomAccessIterator>
+void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe ) {
+    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
+    parallel_stable_sort( xs, xe, std::less<T>() );
+}
+
+
+} // namespace pss
diff --git a/src/threading/pss_common.h b/src/threading/pss_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..d49f6dbf41777dadad91de7be19ad41894f5374a
--- /dev/null
+++ b/src/threading/pss_common.h
@@ -0,0 +1,110 @@
+/*
+  Copyright (C) 2014 Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+  * Neither the name of Intel Corporation nor the names of its
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <utility>
+#include <iterator>
+#include <algorithm>
+
+namespace pss {
+
+namespace internal {
+
+//! Destroy sequence [xs,xe)
+template<class RandomAccessIterator>
+void serial_destroy( RandomAccessIterator zs, RandomAccessIterator ze ) {
+    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
+    while( zs!=ze ) {
+        --ze;
+        (*ze).~T();
+    }
+}
+
+//! Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using std::move
+template<class RandomAccessIterator1, class RandomAccessIterator2, class RandomAccessIterator3, class Compare>
+void serial_move_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp ) {
+    if( xs!=xe ) {
+        if( ys!=ye )
+            for(;;)
+                if( comp(*ys,*xs) ) {
+                    *zs = std::move(*ys);
+                    ++zs;
+                    if( ++ys==ye ) break;
+                } else {
+                    *zs = std::move(*xs);
+                    ++zs;
+                    if( ++xs==xe ) goto movey;
+                }
+        ys = xs;
+        ye = xe;
+    }
+movey:
+    std::move( ys, ye, zs );
+}
+
+template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
+void stable_sort_base_case(  RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp) {
+    std::sort( xs, xe, comp );
+    if( inplace!=2 ) {
+        RandomAccessIterator2 ze = zs + (xe-xs);
+        typedef typename std::iterator_traits<RandomAccessIterator2>::value_type T;
+        if( inplace )
+            // Initialize the temporary buffer
+            for( ; zs<ze; ++zs )
+                new(&*zs) T;
+        else
+            // Initialize the temporary buffer and move keys to it.
+            for( ; zs<ze; ++xs, ++zs )
+                new(&*zs) T(std::move(*xs));
+    }
+}
+
+//! Raw memory buffer with automatic cleanup.
+class raw_buffer {
+    void* ptr;
+public:
+    //! Try to obtain buffer of given size.
+    raw_buffer( size_t bytes ) : ptr( operator new(bytes,std::nothrow) ) {}
+    //! True if buffer was successfully obtained, zero otherwise.
+    operator bool() const {return ptr;}
+    //! Return pointer to buffer, or  NULL if buffer could not be obtained.
+    void* get() const {return ptr;}
+    //! Destroy buffer
+    ~raw_buffer() {operator delete(ptr);}
+};
+
+} // namespace internal
+
+
+/*template<typename T>
+void parallel_stable_sort(typename std::iterator<T> i,typename std::iterator<T> j){
+  parallel_stable_sort( i, j, std::less<T>() );
+}*/
+} // namespace pss
diff --git a/src/threading/threading.hpp b/src/threading/threading.hpp
index 76431d7de8cb85aebcf18ce1988ab7f6f2ea118f..26b2bace304ac14195d940f1eccf4cf247c43043 100644
--- a/src/threading/threading.hpp
+++ b/src/threading/threading.hpp
@@ -2,6 +2,8 @@
 
 #if defined(WITH_TBB)
     #include "tbb.hpp"
+#elif defined(WITH_OMP)
+    #include "omp.hpp"
 #else
     #define WITH_SERIAL
     #include "serial.hpp"
diff --git a/tests/unit/test_spike_store.cpp b/tests/unit/test_spike_store.cpp
index dd075f878bfccc7a7a07887e6736fcb2768e0237..cf5e529005c33f944bfe7506d17ae4467c969501 100644
--- a/tests/unit/test_spike_store.cpp
+++ b/tests/unit/test_spike_store.cpp
@@ -1,5 +1,6 @@
 #include "gtest.h"
 
+#include <threading/threading.hpp>
 #include <thread_private_spike_store.hpp>
 
 TEST(spike_store, insert)