From 0ded25a68266f691497311e3052d5f5fdd5a6686 Mon Sep 17 00:00:00 2001
From: Ivan Martinez <ivan.martinez@bsc.es>
Date: Fri, 4 Nov 2016 12:00:08 +0100
Subject: [PATCH] Omp (#38)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* first version of openmp threading back end
* adding openmp parallel sort implementation
* OpenMP sort working
* Support for units syntax within state block.
* Add soma-less cable cell to test cells.
Also:
* Ensure intrinsic and passive properties properly set on test cells.
* Change bulk resistivity default.
* Align defaults with values used in most of the NEURON
validation scripts.
* Use consistent 100 Ω·m bulk resistivity across both
NEURON test models and basic validation cells.
* OpenMP back end working
* Add Extrae+paraver support, needs to fix compilation warnings
* Reorganize validation data generation
* Move generation and data to top-level validation directory.
* Make BUILD_VALIDATION_DATA and VALIDATION_DATA_DIR cache vars.
* Add helper CMake functions for data generation.
Note `validation/ref/numeric/foo.sh` is just a placeholder.
* Bugfix: hh_soma.jl
* Use consistent scaling for y[1] scalar voltage in hh_soma.jl
* Also: add more reserved target names to CMakeLists.txt
helper function.
* Refactor convergence tests; add numeric soma ref.
* Amend data directory path in validation tests.
* Enmodulate `hh_soma.jl`
* Add HH channel reference data generations script.
* Switch `validate_soma.cpp` to numeric reference data.
* Consolidate common code in `validate_ball_and_stick.cpp`
* Add (nearly) Rallpack1 validation test (see below).
* Gentle failure on absence of reference data in
`validate_ball_and_stick.cpp`
Can't yet override mechanism default parameter values,
so the cable cell model added to `test_common_cells.hpp`
lets the default stand; validation script will have
to use the default membrane conductance rather than that
given by Rallpack1.
* Add Rallpack1 validation, plus bugfix, clean
* Implement Rallpack1 validation test (with a workaround
for inability to set membrane conductance).
* Fix bug in L≠1 case in PassiveCable.jl (this may still be
wrong).
* Fix bug in peak delta computation in trace analysis when
both traces have no local maxima.
* Gentle failure on missing `numeric_soma.json`
* Allow multiple `-s` selection operations for `tsplot`,
acting disjunctively.
* Remove errant test file.
* file's cleanup
* Remove tabs
* Use correct routine in numeric_rallpack1.jl x0.3
* Configure-time test for julia
* `math::infinity<>()` wrapper for infinity
* Use name `i_e` for Stim current density
* Use `math::infinity<>()` for infinite value
* Adds unit tests for the STATE block.
* Add "lib" to search prefixes for libtbb
* Fix quoting error in library search.
* Add "lib" to prefixes when system is "Linux".
* Address deprecated use of 'symbol' warning.
Julia 0.5 deprecates use of `symbol` instead of
`Symbol`. This patch just substitutes the
correct call.
* Address deprecated use of 'symbol' warning.
Julia 0.5 deprecates use of `symbol` instead of
`Symbol`. This patch just substitutes the
correct call.
* Addresses PR comments.
* Unit tests for math.hpp
* Tests for `math::pi`, `math::lerp`, `math::area_frustrum`
and `math::volume_frustrum`
* Fix `math:pi<long double>()`.
* Extend range, view functionality.
* New `filter` view: lazily selects based on predicate.
* Generic `front` and `back` for sequences.
* New rangeutil STL wrappers `stable_sort_by`, `all_of`, `any_of`.
* Consolidate common utility unit testing structures into
`tests/unit/common.hpp`
* Add `ball_and_squiggle` model; fix `ball_and_taper`.
* Make `test_common_cells.hpp` and `ball_and_taper.py` agree.
* Add `ball_and_squiggle` model that has a tapering undulating
profile.
* Address PR#46 review comments.
* Add documentation of template parameters for `filter_iterator`.
* Document use of `uninitalized<F>` for holding functional objects
in `filter_iterator` and `transform_iterator`
* Consolidate validation test code (issue #41)
* Simplify trace analysis and reporting code in
`trace_analysis.hpp`
* Consolidate convergence test run procedures into
new class `convergence_test_runner`.
* New compartment info structure for FVM.
* Make `algorithm::sum`, `algorithm::mean` more generic,
allowing use with array types.
* Add `div_compartment` compartment representation, that
holds geometric information for each half of a compartment
that will then be used in calculating control volumes.
* Add three compartmentalisation schemes/policies that
discretize a segment into `div_compartment` objects:
* `div_compartment_by_ends` divides based only on the
segment end points and radii.
* `div_compartment_sampler` forms frusta by sampling
the segment radius at each compartment boundary
* `div_compartment_integrator` computes the compartment
areas and volumes exactly by summing all frustra
in the intersection of the segment and the compartmnet
span.
* Extrae linked at execution time
* cleaning project
* Complex compartments
* Use divided compartments to determine FVM coefficients.
* Pick correct control volume in FVM from sgement position (avoids
off-by-half error.)
* Add colour override functionality to tsplot: `--colour` option.
* Add const accessor for cell soma.
* Source formatting, comments in `math.hpp`
* Fix `range_view`: was using incorrectly named type trait.
* Add unit test for `range_view`.
* Allow points of discontinuity to be omitted from L-infinity norm
calculations.
* Add `-d, --min-dt` option to `validate.exe` to control time
step in validation convergence tests.
* Add validation test: confirm divided compartment policy does
not effect results on simple frustrum dendrites.
* Change default max compartments on validation tests to 100
(ad hoc observed convergence limit at dt circa 0.001 ms;
finder spatial division would required much finer dt.)
* Make NEURON validation data generation scripts use CVODE by
default, and with `secondorder=2` when non-zero `dt` is given.
* Remove division policy type parameter.
* Use only `div_compartment_integrator` for compartmentalization in
`fvm_multicell`. The policy will later be moved to a backend
policy class.
* For now, disable validation tests that test different division
policies (see above).
* Tweak comments and remove redundant `using`, following comments
on PR#54.
* Minor twicks and corrections
---
CMakeLists.txt | 24 +++-
scripts/extrae/extrae.xml | 81 +++++++++++
scripts/extrae/run_instrumented_omp.sh | 38 +++++
scripts/extrae/run_instrumented_serial.sh | 19 +++
scripts/extrae/run_once.sh | 15 ++
src/connection.hpp | 2 +
src/threading/omp.hpp | 163 ++++++++++++++++++++++
src/threading/parallel_stable_sort.h | 127 +++++++++++++++++
src/threading/pss_common.h | 110 +++++++++++++++
src/threading/threading.hpp | 2 +
tests/unit/test_spike_store.cpp | 1 +
11 files changed, 578 insertions(+), 4 deletions(-)
create mode 100644 scripts/extrae/extrae.xml
create mode 100755 scripts/extrae/run_instrumented_omp.sh
create mode 100755 scripts/extrae/run_instrumented_serial.sh
create mode 100755 scripts/extrae/run_once.sh
create mode 100644 src/threading/omp.hpp
create mode 100644 src/threading/parallel_stable_sort.h
create mode 100644 src/threading/pss_common.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 86bd6f78..dfec55ac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,12 +31,25 @@ if(WITH_TRACE)
add_definitions("-DWITH_TRACE")
endif()
-# TBB support
-set(WITH_TBB OFF CACHE BOOL "use TBB for on-node threading" )
-if(WITH_TBB)
+#threading model selection
+set(THREADING_MODEL "serial" CACHE STRING "set the threading model, one of serial/tbb/omp")
+if(THREADING_MODEL MATCHES "tbb")
+ # TBB support
find_package(TBB REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TBB_DEFINITIONS}")
add_definitions(-DWITH_TBB)
+
+elseif(THREADING_MODEL MATCHES "omp")
+ # OpenMP support
+ find_package(OpenMP REQUIRED)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+ add_definitions(-DWITH_OMP)
+
+elseif(THREADING_MODEL MATCHES "serial")
+ #setup previously done
+
+else()
+ message( FATAL_ERROR "-- Threading model '${THREADING_MODEL}' not supported, use one of serial/tbb/omp")
endif()
# MPI support
@@ -50,12 +63,15 @@ if(WITH_MPI)
set_property(DIRECTORY APPEND_STRING PROPERTY COMPILE_OPTIONS "${MPI_C_COMPILE_FLAGS}")
endif()
-# Profiler support
+
+# Internal profiler support
set(WITH_PROFILING OFF CACHE BOOL "use built-in profiling of miniapp" )
if(WITH_PROFILING)
add_definitions(-DWITH_PROFILING)
endif()
+
+
# Cray systems
set(SYSTEM_CRAY OFF CACHE BOOL "add flags for compilation on Cray systems")
if(SYSTEM_CRAY)
diff --git a/scripts/extrae/extrae.xml b/scripts/extrae/extrae.xml
new file mode 100644
index 00000000..fb3e7311
--- /dev/null
+++ b/scripts/extrae/extrae.xml
@@ -0,0 +1,81 @@
+<?xml version='1.0'?>
+
+<trace enabled="yes"
+ home="/apps/CEPBATOOLS/extrae/3.3.0/openmpi+libgomp4.2/64"
+ initial-mode="detail"
+ type="paraver"
+ xml-parser-id="Id: xml-parse.c 3918 2016-03-11 14:59:01Z harald $"
+>
+
+ <openmp enabled="yes">
+ <locks enabled="no" />
+ <counters enabled="yes" />
+ </openmp>
+
+ <pthread enabled="no">
+ <locks enabled="no" />
+ <counters enabled="yes" />
+ </pthread>
+
+ <counters enabled="yes">
+ <cpu enabled="yes" starting-set-distribution="1">
+ <set enabled="yes" domain="all" changeat-time="0">
+ PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_L1_DCM,PAPI_L2_DCM,PAPI_L3_TCM,PAPI_FP_INS,PAPI_BR_MSP
+ </set>
+ <set enabled="yes" domain="all" changeat-time="0">
+ PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_LD_INS,PAPI_SR_INS,PAPI_BR_UCN,PAPI_BR_CN,PAPI_VEC_SP,RESOURCE_STALLS
+ <sampling enabled="no" period="1000000000">PAPI_TOT_CYC</sampling>
+ </set>
+ </cpu>
+
+ <network enabled="no" />
+
+ <resource-usage enabled="no" />
+
+ <memory-usage enabled="no" />
+ </counters>
+
+ <storage enabled="no">
+ <trace-prefix enabled="yes">TRACE</trace-prefix>
+ <size enabled="no">5</size>
+ <temporal-directory enabled="yes">/scratch</temporal-directory>
+ <final-directory enabled="yes">/gpfs/scratch/bsc41/bsc41273</final-directory>
+ </storage>
+
+ <buffer enabled="yes">
+ <size enabled="yes">500000</size>
+ <circular enabled="no" />
+ </buffer>
+
+ <trace-control enabled="yes">
+ <file enabled="no" frequency="5M">/gpfs/scratch/bsc41/bsc41273/control</file>
+ <global-ops enabled="no"></global-ops>
+ </trace-control>
+
+ <others enabled="yes">
+ <minimum-time enabled="no">10M</minimum-time>
+ <finalize-on-signal enabled="yes"
+ SIGUSR1="no" SIGUSR2="no" SIGINT="yes"
+ SIGQUIT="yes" SIGTERM="yes" SIGXCPU="yes"
+ SIGFPE="yes" SIGSEGV="yes" SIGABRT="yes"
+ />
+ <flush-sampling-buffer-at-instrumentation-point enabled="yes" />
+ </others>
+
+ <sampling enabled="no" type="virtual" period="50m" variability="10m" />
+
+ <dynamic-memory enabled="no" />
+
+ <input-output enabled="no" />
+
+ <merge enabled="yes"
+ synchronization="default"
+ tree-fan-out="16"
+ max-memory="512"
+ joint-states="yes"
+ keep-mpits="yes"
+ sort-addresses="yes"
+ overwrite="yes"
+ />
+
+</trace>
diff --git a/scripts/extrae/run_instrumented_omp.sh b/scripts/extrae/run_instrumented_omp.sh
new file mode 100755
index 00000000..7e0601a1
--- /dev/null
+++ b/scripts/extrae/run_instrumented_omp.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+display_usage() {
+ echo -e "\nUsage:\n$0 [num_threads] \n"
+}
+
+# if less than one arguments supplied, display usage
+if [ $# -le 0 ]
+ then
+ display_usage
+ exit 1
+fi
+
+# check whether user had supplied -h or --help . If yes display usage
+if [[ ( $# == "--help") || $# == "-h" ]]
+ then
+ display_usage
+ exit 0
+fi
+
+if [ -z ${EXTRAE_HOME+x} ]
+ then
+ echo -e "\nSpecify EXTRAE_HOME variable before executing this script\n"
+ echo " export EXTRAE_HOME=path/to/directory"
+ exit 1
+ else
+ echo -e "\nEXTRAE_HOME is set to '$EXTRAE_HOME'"
+fi
+
+export OMP_NUM_THREADS=$1
+export EXTRAE_CONFIG_FILE=extrae.xml
+source ${EXTRAE_HOME}/etc/extrae.sh
+export LD_PRELOAD=$(find $EXTRAE_HOME -name "libomptrace.so")
+
+
+./../../build/miniapp/miniapp.exe
+
+unset LD_PRELOAD
\ No newline at end of file
diff --git a/scripts/extrae/run_instrumented_serial.sh b/scripts/extrae/run_instrumented_serial.sh
new file mode 100755
index 00000000..f8787e8d
--- /dev/null
+++ b/scripts/extrae/run_instrumented_serial.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+if [ -z ${EXTRAE_HOME+x} ]
+ then
+ echo -e "\nSpecify EXTRAE_HOME variable before executing this script\n"
+ echo " export EXTRAE_HOME=path/to/directory"
+ exit 1
+ else
+ echo -e "\nEXTRAE_HOME is set to '$EXTRAE_HOME'"
+fi
+
+
+export EXTRAE_CONFIG_FILE=extrae.xml
+source ${EXTRAE_HOME}/etc/extrae.sh
+export LD_PRELOAD=$(find $EXTRAE_HOME -name "libseqtrace.so")
+
+./../../build/miniapp/miniapp.exe
+
+unset LD_PRELOAD
\ No newline at end of file
diff --git a/scripts/extrae/run_once.sh b/scripts/extrae/run_once.sh
new file mode 100755
index 00000000..b8638418
--- /dev/null
+++ b/scripts/extrae/run_once.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# @ job_name = miniapp
+# @ partition = debug
+## @ reservation =
+# @ initialdir = .
+# @ output = miniapp_%j.out
+# @ error = miniapp_%j.err
+# @ total_tasks = 1
+# @ cpus_per_task = 12
+# @ node_usage = not_shared
+# @ wall_clock_limit = 00:15:00
+
+export OMP_NUM_THREADS=16
+./run_instrumented.sh
+
diff --git a/src/connection.hpp b/src/connection.hpp
index b9e40fb5..361aee20 100644
--- a/src/connection.hpp
+++ b/src/connection.hpp
@@ -15,6 +15,8 @@ public:
using id_type = cell_member_type;
using time_type = Time;
+ connection()=default;
+
connection(id_type src, id_type dest, float w, time_type d) :
source_(src),
destination_(dest),
diff --git a/src/threading/omp.hpp b/src/threading/omp.hpp
new file mode 100644
index 00000000..ad43e82b
--- /dev/null
+++ b/src/threading/omp.hpp
@@ -0,0 +1,163 @@
+#pragma once
+
+#if !defined(WITH_OMP)
+ #error "this header can only be loaded if WITH_OMP is set"
+#endif
+
+#include <omp.h>
+#include "parallel_stable_sort.h"
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <string>
+#include <vector>
+
+namespace nest {
+namespace mc {
+namespace threading {
+
+///////////////////////////////////////////////////////////////////////
+// types
+///////////////////////////////////////////////////////////////////////
+template <typename T>
+class enumerable_thread_specific {
+ using storage_class = std::vector<T>;
+ storage_class data;
+public :
+ using iterator = typename storage_class::iterator;
+ using const_iterator = typename storage_class::const_iterator;
+
+ enumerable_thread_specific() {
+ data = std::vector<T>(omp_get_max_threads());
+ }
+
+ enumerable_thread_specific(const T& init) {
+ data = std::vector<T>(omp_get_max_threads(), init);
+ }
+
+ T& local() { return data[omp_get_thread_num()]; }
+ const T& local() const { return data[omp_get_thread_num()]; }
+
+ auto size() -> decltype(data.size()) const { return data.size(); }
+
+ iterator begin() { return data.begin(); }
+ iterator end() { return data.end(); }
+
+ const_iterator begin() const { return data.begin(); }
+ const_iterator end() const { return data.end(); }
+
+ const_iterator cbegin() const { return data.cbegin(); }
+ const_iterator cend() const { return data.cend(); }
+};
+
+///////////////////////////////////////////////////////////////////////
+// algorithms
+///////////////////////////////////////////////////////////////////////
+struct parallel_for {
+ template <typename F>
+ static void apply(int left, int right, F f) {
+ #pragma omp parallel for
+ for(int i=left; i<right; ++i) {
+ f(i);
+ }
+ }
+};
+
+template <typename RandomIt>
+void sort(RandomIt begin, RandomIt end) {
+ pss::parallel_stable_sort(begin, end);
+}
+
+template <typename RandomIt, typename Compare>
+void sort(RandomIt begin, RandomIt end, Compare comp) {
+ pss::parallel_stable_sort(begin, end ,comp);
+}
+
+template <typename Container>
+void sort(Container& c) {
+ pss::parallel_stable_sort(c.begin(), c.end());
+}
+
+
+template <typename T>
+class parallel_vector {
+ using value_type = T;
+ std::vector<value_type> data_;
+public:
+ parallel_vector() = default;
+ using iterator = typename std::vector<value_type>::iterator;
+ using const_iterator = typename std::vector<value_type>::const_iterator;
+
+ iterator begin() { return data_.begin(); }
+ iterator end() { return data_.end(); }
+
+ const_iterator begin() const { return data_.begin(); }
+ const_iterator end() const { return data_.end(); }
+
+ const_iterator cbegin() const { return data_.cbegin(); }
+ const_iterator cend() const { return data_.cend(); }
+
+ void push_back (const value_type& val) {
+ #pragma omp critical
+ data_.push_back(val);
+ }
+
+ void push_back (value_type&& val) {
+ #pragma omp critical
+ data_.push_back(std::move(val));
+ }
+};
+
+inline std::string description() {
+ return "OpenMP";
+}
+
+struct timer {
+ using time_point = std::chrono::time_point<std::chrono::system_clock>;
+
+ static inline time_point tic() {
+ return std::chrono::system_clock::now();
+ }
+
+ static inline double toc(time_point t) {
+ return std::chrono::duration<double>(tic() - t).count();
+ }
+
+ static inline double difference(time_point b, time_point e) {
+ return std::chrono::duration<double>(e-b).count();
+ }
+};
+
+constexpr bool multithreaded() { return true; }
+
+
+class task_group {
+public:
+ task_group() = default;
+
+ template<typename Func>
+ void run(const Func& f) {
+ f();
+ }
+
+ template<typename Func>
+ void run_and_wait(const Func& f) {
+ f();
+ }
+
+ void wait()
+ {}
+
+ bool is_canceling() {
+ return false;
+ }
+
+ void cancel()
+ {}
+};
+
+} // threading
+} // mc
+} // nest
+
diff --git a/src/threading/parallel_stable_sort.h b/src/threading/parallel_stable_sort.h
new file mode 100644
index 00000000..caf010dd
--- /dev/null
+++ b/src/threading/parallel_stable_sort.h
@@ -0,0 +1,127 @@
+/*
+ Copyright (C) 2014 Intel Corporation
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <algorithm>
+#include <omp.h>
+
+#include "pss_common.h"
+
+namespace pss {
+
+namespace internal {
+
+// Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,zs+(xe-xs)+(ye-ys))
+// Destroy input sequence iff destroy==true
+template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
+#if __INTEL_COMPILER<=1500
+// Work around bug where firstprivate applied to formal parameter does not work.
+void parallel_move_merge( RandomAccessIterator1 xs_, RandomAccessIterator1 xe, RandomAccessIterator2 ys_, RandomAccessIterator2 ye, RandomAccessIterator3 zs_, bool destroy, Compare comp ) {
+ RandomAccessIterator1 xs = xs_;
+ RandomAccessIterator2 ys = ys_;
+ RandomAccessIterator3 zs = zs_;
+#else
+void parallel_move_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp ) {
+#endif
+ const int MERGE_CUT_OFF = 2000;
+ while( (xe-xs) + (ye-ys) > MERGE_CUT_OFF ) {
+ RandomAccessIterator1 xm;
+ RandomAccessIterator2 ym;
+ if( xe-xs < ye-ys ) {
+ ym = ys+(ye-ys)/2;
+ xm = std::upper_bound(xs,xe,*ym,comp);
+ } else {
+ xm = xs+(xe-xs)/2;
+ ym = std::lower_bound(ys,ye,*xm,comp);
+ }
+#pragma omp task untied mergeable firstprivate(xs,xm,ys,ym,zs,destroy,comp)
+ parallel_move_merge( xs, xm, ys, ym, zs, destroy, comp );
+ zs += (xm-xs) + (ym-ys);
+ xs = xm;
+ ys = ym;
+ }
+ serial_move_merge( xs, xe, ys, ye, zs, comp );
+ if( destroy ) {
+ serial_destroy( xs, xe );
+ serial_destroy( ys, ye );
+ }
+#pragma omp taskwait
+}
+
+// Sorts [xs,xe), where zs[0:xe-xs) is temporary buffer supplied by caller.
+// Result is in [xs,xe) if inplace==true, otherwise in [zs,zs+(xe-xs))
+template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
+void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp ) {
+ //typedef typename std::iterator_traits<RandomAccessIterator2>::value_type T;
+ const int SORT_CUT_OFF = 500;
+ if( xe-xs<=SORT_CUT_OFF ) {
+ stable_sort_base_case(xs, xe, zs, inplace, comp);
+ } else {
+ RandomAccessIterator1 xm = xs + (xe-xs)/2;
+ RandomAccessIterator2 zm = zs + (xm-xs);
+ RandomAccessIterator2 ze = zs + (xe-xs);
+#pragma omp task
+ parallel_stable_sort_aux( xs, xm, zs, !inplace, comp );
+ parallel_stable_sort_aux( xm, xe, zm, !inplace, comp );
+#pragma omp taskwait
+ if( inplace )
+ parallel_move_merge( zs, zm, zm, ze, xs, inplace==2, comp );
+ else
+ parallel_move_merge( xs, xm, xm, xe, zs, false, comp );
+ }
+}
+
+} // namespace internal
+
+
+
+template<typename RandomAccessIterator, typename Compare>
+void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe, Compare comp ) {
+ typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
+ if( internal::raw_buffer z = internal::raw_buffer( sizeof(T)*(xe-xs) ) )
+ if( omp_get_num_threads() > 1 )
+ internal::parallel_stable_sort_aux( xs, xe, (T*)z.get(), 2, comp );
+ else
+ #pragma omp parallel
+ #pragma omp master
+ internal::parallel_stable_sort_aux( xs, xe, (T*)z.get(), 2, comp );
+ else
+ // Not enough memory available - fall back on serial sort
+ std::stable_sort( xs, xe, comp );
+}
+
+template<class RandomAccessIterator>
+void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe ) {
+ typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
+ parallel_stable_sort( xs, xe, std::less<T>() );
+}
+
+
+} // namespace pss
diff --git a/src/threading/pss_common.h b/src/threading/pss_common.h
new file mode 100644
index 00000000..d49f6dbf
--- /dev/null
+++ b/src/threading/pss_common.h
@@ -0,0 +1,110 @@
+/*
+ Copyright (C) 2014 Intel Corporation
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <utility>
+#include <iterator>
+#include <algorithm>
+
+namespace pss {
+
+namespace internal {
+
+//! Destroy sequence [xs,xe)
+template<class RandomAccessIterator>
+void serial_destroy( RandomAccessIterator zs, RandomAccessIterator ze ) {
+ typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
+ while( zs!=ze ) {
+ --ze;
+ (*ze).~T();
+ }
+}
+
+//! Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using std::move
+template<class RandomAccessIterator1, class RandomAccessIterator2, class RandomAccessIterator3, class Compare>
+void serial_move_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp ) {
+ if( xs!=xe ) {
+ if( ys!=ye )
+ for(;;)
+ if( comp(*ys,*xs) ) {
+ *zs = std::move(*ys);
+ ++zs;
+ if( ++ys==ye ) break;
+ } else {
+ *zs = std::move(*xs);
+ ++zs;
+ if( ++xs==xe ) goto movey;
+ }
+ ys = xs;
+ ye = xe;
+ }
+movey:
+ std::move( ys, ye, zs );
+}
+
+template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
+void stable_sort_base_case( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp) {
+ std::sort( xs, xe, comp );
+ if( inplace!=2 ) {
+ RandomAccessIterator2 ze = zs + (xe-xs);
+ typedef typename std::iterator_traits<RandomAccessIterator2>::value_type T;
+ if( inplace )
+ // Initialize the temporary buffer
+ for( ; zs<ze; ++zs )
+ new(&*zs) T;
+ else
+ // Initialize the temporary buffer and move keys to it.
+ for( ; zs<ze; ++xs, ++zs )
+ new(&*zs) T(std::move(*xs));
+ }
+}
+
+//! Raw memory buffer with automatic cleanup.
+class raw_buffer {
+ void* ptr;
+public:
+ //! Try to obtain buffer of given size.
+ raw_buffer( size_t bytes ) : ptr( operator new(bytes,std::nothrow) ) {}
+ //! True if buffer was successfully obtained, zero otherwise.
+ operator bool() const {return ptr;}
+ //! Return pointer to buffer, or NULL if buffer could not be obtained.
+ void* get() const {return ptr;}
+ //! Destroy buffer
+ ~raw_buffer() {operator delete(ptr);}
+};
+
+} // namespace internal
+
+
+/*template<typename T>
+void parallel_stable_sort(typename std::iterator<T> i,typename std::iterator<T> j){
+ parallel_stable_sort( i, j, std::less<T>() );
+}*/
+} // namespace pss
diff --git a/src/threading/threading.hpp b/src/threading/threading.hpp
index 76431d7d..26b2bace 100644
--- a/src/threading/threading.hpp
+++ b/src/threading/threading.hpp
@@ -2,6 +2,8 @@
#if defined(WITH_TBB)
#include "tbb.hpp"
+#elif defined(WITH_OMP)
+ #include "omp.hpp"
#else
#define WITH_SERIAL
#include "serial.hpp"
diff --git a/tests/unit/test_spike_store.cpp b/tests/unit/test_spike_store.cpp
index dd075f87..cf5e5290 100644
--- a/tests/unit/test_spike_store.cpp
+++ b/tests/unit/test_spike_store.cpp
@@ -1,5 +1,6 @@
#include "gtest.h"
+#include <threading/threading.hpp>
#include <thread_private_spike_store.hpp>
TEST(spike_store, insert)
--
GitLab