diff --git a/CMakeLists.txt b/CMakeLists.txt index c89d2d979e30142e96da464c9d9d46ec9aec9f2e..29a2c9066886554fc561f0a645f15459fbea99d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,7 +130,7 @@ install(TARGETS arbor-config-defs EXPORT arbor-targets) # for the arbor library. add_library(arbor-private-deps INTERFACE) -target_link_libraries(arbor-private-deps INTERFACE arbor-config-defs) +target_link_libraries(arbor-private-deps INTERFACE arbor-config-defs ext-random123) install(TARGETS arbor-private-deps EXPORT arbor-targets) # Interface library `arborenv-private-deps` collects dependencies, options etc. diff --git a/arbor/include/arbor/morph/embed_pwlin.hpp b/arbor/include/arbor/morph/embed_pwlin.hpp index 82d83a570bc6f95d163acd8d838869f1a47a9591..d6947ac87ed594ea52918aaf5e59296120b7c9fc 100644 --- a/arbor/include/arbor/morph/embed_pwlin.hpp +++ b/arbor/include/arbor/morph/embed_pwlin.hpp @@ -28,6 +28,10 @@ struct embed_pwlin { // Interpolated radius at location. double radius(mlocation) const; + mcable_list radius_cmp(msize_t bid, double rad_lim, comp_op op) const; + + double directed_projection(mlocation) const; + mcable_list projection_cmp(msize_t bid, double proj_lim, comp_op op) const; // Computed length of mcable. double integrate_length(mcable c) const; diff --git a/arbor/include/arbor/morph/locset.hpp b/arbor/include/arbor/morph/locset.hpp index d4785d89a8e9bd53260ff89cadaed3f56083d505..3429717526f9a75a4a1c10de62bb2342bf8b1f43 100644 --- a/arbor/include/arbor/morph/locset.hpp +++ b/arbor/include/arbor/morph/locset.hpp @@ -118,8 +118,9 @@ private: }; }; -namespace ls { +class region; +namespace ls { // Explicit location on morphology. locset location(msize_t branch, double pos); @@ -138,6 +139,16 @@ locset named(std::string); // The null (empty) set. 
locset nil(); +// Most distal points of a region +locset most_distal(region reg); + +// Most proximal point of a region (the proximal end of its first cable) +locset most_proximal(region reg); + +// Locations drawn uniformly by length from region `reg`: elements `left` to `right` +// (inclusive) of the random sequence generated from `seed` +locset uniform(region reg, unsigned left, unsigned right, uint64_t seed); + // Proportional location on every branch. locset on_branches(double pos); diff --git a/arbor/include/arbor/morph/primitives.hpp b/arbor/include/arbor/morph/primitives.hpp index 4b9e223b993db2659e94c00d78796c9b5f53f76c..fa6cb3eab733136757acfc7a715a0b090a746b0a 100644 --- a/arbor/include/arbor/morph/primitives.hpp +++ b/arbor/include/arbor/morph/primitives.hpp @@ -28,6 +28,14 @@ mpoint lerp(const mpoint& a, const mpoint& b, double u); bool is_collocated(const mpoint& a, const mpoint& b); double distance(const mpoint& a, const mpoint& b); +// Comparison operations (<, <=, >, >=) used to select regions against a threshold +enum class comp_op { + lt, + le, + gt, + ge +}; + // A morphology sample consists of a location and an integer tag. // When loaded from an SWC file, the tag will correspond to the SWC label, // which are standardised as follows: diff --git a/arbor/include/arbor/morph/region.hpp b/arbor/include/arbor/morph/region.hpp index bd049fd68225f314ca2d92e1cf1e0bce33a774d1..629c7345b05e040f14baccecd2a08e65ca8190e6 100644 --- a/arbor/include/arbor/morph/region.hpp +++ b/arbor/include/arbor/morph/region.hpp @@ -116,6 +116,8 @@ private: }; }; +class locset; + namespace reg { // An empty region. @@ -130,6 +132,28 @@ region branch(msize_t); // Region with all segments with segment tag id. 
region tagged(int id); +// Region of all points within `distance` (um) distal to the locations in `start` +region distal_interval(locset start, double distance); + +// Region of all points within `distance` (um) proximal to the locations in `end` +region proximal_interval(locset end, double distance); + +// Parts of region `reg` with radius less than/less than or equal to r +region radius_lt(region reg, double r); +region radius_le(region reg, double r); + +// Parts of region `reg` with radius greater than/greater than or equal to r +region radius_gt(region reg, double r); +region radius_ge(region reg, double r); + +// Region where |z - z0| is less than/less than or equal to r, with z0 the z coordinate of the first sample +region z_dist_from_soma_lt(double r); +region z_dist_from_soma_le(double r); + +// Region where |z - z0| is greater than/greater than or equal to r, with z0 the z coordinate of the first sample +region z_dist_from_soma_gt(double r); +region z_dist_from_soma_ge(double r); + // Region with all segments in a cell. 
region all(); diff --git a/arbor/morph/embed_pwlin.cpp b/arbor/morph/embed_pwlin.cpp index e04436b4598224ab3a975ac60f5c527e40fd11b2..d81e9e4c6145c40c4223ed7149b7200abc3c33fc 100644 --- a/arbor/morph/embed_pwlin.cpp +++ b/arbor/morph/embed_pwlin.cpp @@ -53,14 +53,48 @@ double integrate(const branch_pw_ratpoly<p, q>& f, unsigned bid, const pw_consta return accum; } +template <typename operation> +mcable_list data_cmp(const branch_pw_ratpoly<1, 0>& f, unsigned bid, double val, operation op) { + mcable_list L; + const auto& pw = f.at(bid); + for (const auto& piece: pw) { + auto extents = piece.first; + auto left_val = piece.second(0); + auto right_val = piece.second(1); + // Neither end satisfies op: skip the piece; both ends satisfy op: keep it whole + if (!op(left_val, val) && !op(right_val, val)) { + continue; + } + if (op(left_val, val) && op(right_val, val)) { + L.push_back({bid, extents.first, extents.second}); + continue; + } + // Exactly one end satisfies op, so the end values differ: interpolate the position where the value equals val + auto cable_loc = (val - left_val)/(right_val - left_val); + auto edge = math::lerp(extents.first, extents.second, cable_loc); + + if (op(left_val, val)) { + L.push_back({bid, extents.first, 
edge}); + continue; + } + if (!op(left_val, val)) { + L.push_back({bid, edge, extents.second}); + continue; + } + } + return L; +} + struct embed_pwlin_data { branch_pw_ratpoly<1, 0> length; + branch_pw_ratpoly<1, 0> directed_projection; branch_pw_ratpoly<1, 0> radius; branch_pw_ratpoly<2, 0> area; branch_pw_ratpoly<1, 1> ixa; explicit embed_pwlin_data(msize_t n_branch): length(n_branch), + directed_projection(n_branch), radius(n_branch), area(n_branch), ixa(n_branch) @@ -71,6 +105,10 @@ double embed_pwlin::radius(mlocation loc) const { return interpolate(data_->radius, loc.branch, loc.pos); } +double embed_pwlin::directed_projection(arb::mlocation loc) const { + return interpolate(data_->directed_projection, loc.branch, loc.pos); +} + double embed_pwlin::integrate_length(msize_t bid, const pw_constant_fn& g) const { return integrate(data_->length, bid, g); } @@ -97,6 +135,26 @@ double embed_pwlin::integrate_ixa(mcable c) const { return integrate_ixa(c.branch, pw_constant_fn{{c.prox_pos, c.dist_pos}, {1.}}); } +mcable_list embed_pwlin::radius_cmp(msize_t bid, double val, comp_op op) const { + switch (op) { + case comp_op::lt: return data_cmp(data_->radius, bid, val, [](auto l, auto r){return l < r;}); + case comp_op::le: return data_cmp(data_->radius, bid, val, [](auto l, auto r){return l <= r;}); + case comp_op::gt: return data_cmp(data_->radius, bid, val, [](auto l, auto r){return l > r;}); + case comp_op::ge: return data_cmp(data_->radius, bid, val, [](auto l, auto r){return l >= r;}); + default: return {}; + } +} + +mcable_list embed_pwlin::projection_cmp(msize_t bid, double val, comp_op op) const { + switch (op) { + case comp_op::lt: return data_cmp(data_->directed_projection, bid, val, [](auto l, auto r){return l < r;}); + case comp_op::le: return data_cmp(data_->directed_projection, bid, val, [](auto l, auto r){return l <= r;}); + case comp_op::gt: return data_cmp(data_->directed_projection, bid, val, [](auto l, auto r){return l > r;}); + case comp_op::ge: 
return data_cmp(data_->directed_projection, bid, val, [](auto l, auto r){return l >= r;}); + default: return {}; + } +} + // Initialization, creation of geometric data. embed_pwlin::embed_pwlin(const arb::morphology& m) { @@ -109,16 +167,18 @@ embed_pwlin::embed_pwlin(const arb::morphology& m) { const auto& samples = m.samples(); sample_locations_.resize(m.num_samples()); + double proj_shift = samples.front().loc.z; + for (msize_t bid = 0; bid<n_branch; ++bid) { unsigned parent = m.branch_parent(bid); auto sample_indices = util::make_range(m.branch_indexes(bid)); - if (bid==0 && m.spherical_root()) { arb_assert(sample_indices.size()==1); // Treat spherical root as area-equivalent cylinder. double r = samples[0].loc.radius; + data_->directed_projection[bid].push_back(0., 1., rat_element<1, 0>(-r, r)); data_->length[bid].push_back(0., 1., rat_element<1, 0>(0, r*2)); data_->radius[bid].push_back(0., 1., rat_element<1, 0>(r, r)); @@ -155,13 +215,15 @@ embed_pwlin::embed_pwlin(const arb::morphology& m) { double length_0 = parent==mnpos? 0: data_->length[parent].back().second[1]; data_->length[bid].push_back(0., 1, rat_element<1, 0>(length_0, length_0+branch_length)); - double area_0 = parent=mnpos? 0: data_->area[parent].back().second[1]; - double ixa_0 = parent=mnpos? 0: data_->ixa[parent].back().second[1]; + double area_0 = parent==mnpos? 0: data_->area[parent].back().second[1]; + double ixa_0 = parent==mnpos? 0: data_->ixa[parent].back().second[1]; if (length_scale==0) { // Zero-length branch? Weird, but make best show of it. 
double r = samples[sample_indices[0]].loc.radius; + double z = samples[sample_indices[0]].loc.z; data_->radius[bid].push_back(0., 1., rat_element<1, 0>(r, r)); + data_->directed_projection[bid].push_back(0., 1., rat_element<1, 0>(z-proj_shift, z-proj_shift)); data_->area[bid].push_back(0., 1., rat_element<2, 0>(area_0, area_0, area_0)); data_->ixa[bid].push_back(0., 1., rat_element<1, 1>(ixa_0, ixa_0, ixa_0)); } @@ -173,6 +235,10 @@ embed_pwlin::embed_pwlin(const arb::morphology& m) { double p1 = sample_locations_[sample_indices[i]].pos; if (p0==p1) continue; + double z0 = samples[sample_indices[i-1]].loc.z - proj_shift; + double z1 = samples[sample_indices[i]].loc.z - proj_shift; + data_->directed_projection[bid].push_back(p0, p1, rat_element<1, 0>(z0, z1)); + double r0 = samples[sample_indices[i-1]].loc.radius; double r1 = samples[sample_indices[i]].loc.radius; data_->radius[bid].push_back(p0, p1, rat_element<1, 0>(r0, r1)); diff --git a/arbor/morph/locset.cpp b/arbor/morph/locset.cpp index 1f7eafaa123195bda775cca63d6a4c638029b1ff..19b50d84e20745ee7ff2acedba3d7ddf2c7bb54a 100644 --- a/arbor/morph/locset.cpp +++ b/arbor/morph/locset.cpp @@ -2,13 +2,20 @@ #include <iostream> #include <numeric> +#include <unordered_set> +#include <arbor/math.hpp> #include <arbor/morph/locset.hpp> #include <arbor/morph/morphexcept.hpp> #include <arbor/morph/morphology.hpp> #include <arbor/morph/mprovider.hpp> #include <arbor/morph/primitives.hpp> +#include <arbor/morph/region.hpp> +#include "util/cbrng.hpp" +#include "util/partition.hpp" #include "util/rangeutil.hpp" +#include "util/transform.hpp" +#include "util/span.hpp" #include "util/strprintf.hpp" namespace arb { @@ -160,6 +167,125 @@ std::ostream& operator<<(std::ostream& o, const named_& x) { return o << "(locset \"" << x.name << "\")"; } +// Most distal points of a region + +struct most_distal_: locset_tag { + explicit most_distal_(region reg): reg(std::move(reg)) {} + region reg; +}; + +locset most_distal(region 
reg) { + return 
locset(most_distal_{std::move(reg)}); +} + +mlocation_list thingify_(const most_distal_& n, const mprovider& p) { + mlocation_list L; + // Sort lexicographically by (branch, dist_pos): the comparator must be a strict weak ordering + auto cables = thingify(n.reg, p); + util::sort(cables, [](const auto& l, const auto& r){return l.branch < r.branch || (l.branch == r.branch && l.dist_pos < r.dist_pos);}); + // NOTE(review): reverse iteration presumes descendants have larger branch ids than their ancestors; confirm + std::unordered_set<msize_t> branches_visited; + for (auto it = cables.rbegin(); it != cables.rend(); ++it) { + auto bid = it->branch; + auto pos = it->dist_pos; + + // Skip the cable if a point on its branch, or on a branch reached through it, has already been added as a distal point + if (branches_visited.count(bid)) continue; + L.push_back({bid, pos}); + while (bid != mnpos) { + branches_visited.insert(bid); + bid = p.morphology().branch_parent(bid); + } + } + + util::sort(L); + return L; +} + +std::ostream& operator<<(std::ostream& o, const most_distal_& x) { + return o << "(most_distal: " << x.reg << ")"; +} + +// Most proximal point of a region + +struct most_proximal_: locset_tag { + explicit most_proximal_(region reg): reg(std::move(reg)) {} + region reg; +}; + +locset most_proximal(region reg) { + return locset(most_proximal_{std::move(reg)}); +} + +mlocation_list thingify_(const most_proximal_& n, const mprovider& p) { + auto cables = thingify(n.reg, p); + arb_assert(test_invariants(cables)); + if (cables.empty()) return {}; // an empty region has no proximal point + auto most_prox = cables.front(); + return {{most_prox.branch, most_prox.prox_pos}}; +} + +std::ostream& operator<<(std::ostream& o, const most_proximal_& x) { + return o << "(most_proximal: " << x.reg << ")"; +} + + +// Uniform locset. 
+ +struct uniform_ { + region reg; + unsigned left; + unsigned right; + uint64_t seed; +}; + +locset uniform(arb::region reg, unsigned left, unsigned right, uint64_t seed) { + return locset(uniform_{std::move(reg), left, right, seed}); +} + +mlocation_list thingify_(const uniform_& u, const mprovider& p) { + mlocation_list L; + const auto& embed = p.embedding(); + + // Thingify the region; an empty region has no cable on which to place a location + auto reg_cables = thingify(u.reg, p); + if (reg_cables.empty()) return L; + + std::vector<double> lengths_bounds; + auto lengths_part = util::make_partition(lengths_bounds, + util::transform_view(reg_cables, [&embed](const auto& c) { + return embed.integrate_length(c); + })); + + auto region_length = lengths_part.bounds().second; + + // Generate uniform random positions along the extent of the full region + auto random_pos = util::uniform(u.seed, u.left, u.right); + std::transform(random_pos.begin(), random_pos.end(), random_pos.begin(), + [&region_length](auto& c){return c*region_length;}); + util::sort(random_pos); + + // Match the sorted random positions to cables and find the position on the associated branch + unsigned cable_idx = 0; + auto range = lengths_part[cable_idx]; + + for (auto e: random_pos) { + while (e > range.second) { + range = lengths_part[++cable_idx]; + } + auto cable = reg_cables[cable_idx]; + auto pos_on_cable = range.second > range.first? (e - range.first)/(range.second - range.first): 0.; // avoid 0/0 on a zero-length cable + auto pos_on_branch = math::lerp(cable.prox_pos, cable.dist_pos, pos_on_cable); + L.push_back({cable.branch, pos_on_branch}); + } + + return L; +} + +std::ostream& operator<<(std::ostream& o, const uniform_& u) { + return o << "(uniform from region: \"" << u.reg << "\"; using seed: " << u.seed + << "; range: {" << u.left << ", " << u.right << "})"; +} // Intersection of two point sets. 
diff --git a/arbor/morph/region.cpp b/arbor/morph/region.cpp index 3db12f50fb317606054a817f66befb99ca12446f..9aafa123c7beacdcf28f3b542a272093427e658b 100644 --- a/arbor/morph/region.cpp +++ b/arbor/morph/region.cpp @@ -1,6 +1,7 @@ #include <set> #include <string> #include <vector> +#include <stack> #include <arbor/morph/locset.hpp> #include <arbor/morph/primitives.hpp> @@ -163,6 +164,7 @@ mcable_list remove_covered_points(mcable_list cables, const morphology& m) { } } } + util::sort(erase_indices); for (auto it = erase_indices.rbegin(); it != erase_indices.rend(); it++) { cables.erase(cables.begin() + *it); @@ -283,7 +285,6 @@ std::ostream& operator<<(std::ostream& o, const tagged_& t) { return o << "(tag " << t.tag << ")"; } - // Region comprising whole morphology. struct all_: region_tag {}; @@ -306,9 +307,323 @@ std::ostream& operator<<(std::ostream& o, const all_& t) { return o << "(all)"; } +// Region of all points within a given distance distal to the locations of a locset -// Named region. 
+struct distal_interval_ { + locset start; + double distance; //um +}; + +region distal_interval(locset start, double distance) { + return region(distal_interval_{start, distance}); +} + +mcable_list thingify_(const distal_interval_& reg, const mprovider& p) { + const auto& m = p.morphology(); + const auto& e = p.embedding(); + + std::vector<mcable> L; + + auto start = thingify(reg.start, p); + auto distance = reg.distance; + + struct branch_interval { + msize_t bid; + double distance; + }; + + for (auto c: start) { + std::stack<branch_interval> branches_reached; + bool first_branch = true; + + // start on the location's own branch, unless it lies at the distal end: then start traversal with the children + if (c.pos < 1) { + branches_reached.push({c.branch, distance}); + } else { + first_branch = false; + L.push_back({c.branch,1,1}); + for (auto child: m.branch_children(c.branch)) { + branches_reached.push({child, distance}); + } + } + + while (!branches_reached.empty()) { + auto bi = branches_reached.top(); + branches_reached.pop(); 
+ + auto branch = bi.bid; + auto rem_dist = bi.distance; + + auto branch_length = e.branch_length(branch); + auto prox_pos = first_branch*c.pos; + auto dist_pos = rem_dist / branch_length + prox_pos; + + if (dist_pos <= 1) { + L.push_back({branch, prox_pos, dist_pos}); + } else { + L.push_back({branch, prox_pos, 1}); + rem_dist = rem_dist - (1 - prox_pos)*branch_length; + for (auto child: m.branch_children(branch)) { + branches_reached.push({child, rem_dist}); + } + } + first_branch = false; + } + } + return remove_covered_points(remove_cover(L, m), m); +} + +std::ostream& operator<<(std::ostream& o, const distal_interval_& d) { + return o << "(distal_interval: " << d.start << ", " << d.distance << ")"; +} + +// Region of all points within a given distance proximal to the locations of a locset + +struct proximal_interval_ { + locset end; + double distance; //um +}; + +region proximal_interval(locset end, double distance) { + return region(proximal_interval_{end, distance}); +} + +mcable_list thingify_(const proximal_interval_& reg, const mprovider& p) { + const auto& m = p.morphology(); + const auto& e = p.embedding(); + + std::vector<mcable> L; + + auto start = thingify(reg.end, p); + auto distance = reg.distance; + + for (auto c: start) { + auto branch = c.branch; + auto branch_length = e.branch_length(branch); + auto rem_dist = distance; + + auto dist_pos = c.pos; + auto prox_pos = dist_pos - distance / branch_length; + // Walk towards the root while the interval extends beyond the proximal end of the current branch + while (prox_pos < 0) { + L.push_back({branch, 0, dist_pos}); + + rem_dist = rem_dist - dist_pos*branch_length; + + branch = m.branch_parent(branch); + if (branch == mnpos) { + break; + } + branch_length = e.branch_length(branch); // refresh: rem_dist must be reduced by the parent's length on the next pass + dist_pos = 1; + prox_pos = dist_pos - rem_dist / branch_length; + } + if (branch != mnpos) { + L.push_back({branch, prox_pos, dist_pos}); + } + } 
+ return remove_cover(L, m); +} + +std::ostream& operator<<(std::ostream& o, const proximal_interval_& d) { + return o << "(proximal_interval: " << d.end << ", " << d.distance << ")"; +} + +mcable_list radius_cmp(const mprovider& p, 
region r, double v, comp_op op) { + const auto& e = p.embedding(); + // Intersect each cable of the region with the parts of its branch that satisfy the radius comparison + std::vector<mcable> L; + auto reg = thingify(r, p); + auto val = v; + for (auto c: reg) { + for (auto rc: e.radius_cmp(c.branch, val, op)) { + if (is_disjoint(c, rc)) continue; + L.push_back(make_intersection(c, rc)); + } + } + return remove_cover(L, p.morphology()); +} + +// Region with all segments with radius less than r +struct radius_lt_ { + region reg; + double val; //um +}; + +region radius_lt(region reg, double val) { + return region(radius_lt_{reg, val}); +} + +mcable_list thingify_(const radius_lt_& r, const mprovider& p) { + return radius_cmp(p, r.reg, r.val, comp_op::lt); +} + +std::ostream& operator<<(std::ostream& o, const radius_lt_& r) { + return o << "(radius_lt: " << r.reg << ", " << r.val << ")"; +} + +// Region with all segments with radius less than or equal to r +struct radius_le_ { + region reg; + double val; //um +}; + +region radius_le(region reg, double val) { + return region(radius_le_{reg, val}); +} +mcable_list thingify_(const radius_le_& r, const mprovider& p) { + return radius_cmp(p, r.reg, r.val, comp_op::le); +} + +std::ostream& operator<<(std::ostream& o, const radius_le_& r) { + return o << "(radius_le: " << r.reg << ", " << r.val << ")"; +} + +// Region with all segments with radius greater than r +struct radius_gt_ { + region reg; + double val; //um +}; + +region radius_gt(region reg, double val) { + return region(radius_gt_{reg, val}); +} + +mcable_list thingify_(const radius_gt_& r, const mprovider& p) { + return radius_cmp(p, r.reg, r.val, comp_op::gt); +} + +std::ostream& operator<<(std::ostream& o, const radius_gt_& r) { + return o << "(radius_gt: " << r.reg << ", " << r.val << ")"; +} + +// Region with all segments with radius greater than or equal to r +struct radius_ge_ { + region reg; + double val; //um +}; + +region radius_ge(region reg, double val) { + 
return region(radius_ge_{reg, val}); +} + +mcable_list thingify_(const radius_ge_& r, const mprovider& p) { + 
return radius_cmp(p, r.reg, r.val, comp_op::ge); +} + +std::ostream& operator<<(std::ostream& o, const radius_ge_& r) { + return o << "(radius_ge: " << r.reg << ", " << r.val << ")"; +} + +mcable_list projection_cmp(const mprovider& p, double v, comp_op op) { + const auto& m = p.morphology(); + const auto& e = p.embedding(); + // Collect, over every branch, the cables on which the projection satisfies the comparison + std::vector<mcable> L; + auto val = v; + for (auto i: util::make_span(m.num_branches())) { + util::append(L, e.projection_cmp(i, val, op)); + } + return remove_cover(L, p.morphology()); +} + +// Region with all segments with projection less than val +struct projection_lt_{ + double val; //um +}; + +region projection_lt(double val) { + return region(projection_lt_{val}); +} + +mcable_list thingify_(const projection_lt_& r, const mprovider& p) { + return projection_cmp(p, r.val, comp_op::lt); +} + +std::ostream& operator<<(std::ostream& o, const projection_lt_& r) { + return o << "(projection_lt: " << r.val << ")"; +} + +// Region with all segments with projection less than or equal to val +struct projection_le_{ + double val; //um +}; + +region projection_le(double val) { + return region(projection_le_{val}); +} + +mcable_list thingify_(const projection_le_& r, const mprovider& p) { + return projection_cmp(p, r.val, comp_op::le); +} + +std::ostream& operator<<(std::ostream& o, const projection_le_& r) { + return o << "(projection_le: " << r.val << ")"; +} + +// Region with all segments with projection greater than val +struct projection_gt_ { + double val; //um +}; + +region projection_gt(double val) { + return region(projection_gt_{val}); +} + +mcable_list thingify_(const projection_gt_& r, const mprovider& p) { + return projection_cmp(p, r.val, comp_op::gt); +} + +std::ostream& operator<<(std::ostream& o, const projection_gt_& r) { + return o << "(projection_gt: " << r.val << ")"; +} + +// Region with all segments with projection greater than or 
equal to val +struct projection_ge_ { + double val; //um +}; + +region projection_ge(double val) { + return 
region(projection_ge_{val}); +} + +mcable_list thingify_(const projection_ge_& r, const mprovider& p) { + return projection_cmp(p, r.val, comp_op::ge); +} + +std::ostream& operator<<(std::ostream& o, const projection_ge_& r) { + return o << "(projection_ge: " << r.val << ")"; +} + +region z_dist_from_soma_lt(double r0) { + if (r0 == 0) { + return {}; + } + region lt = reg::projection_lt(r0); + region gt = reg::projection_gt(-r0); + return intersect(std::move(lt), std::move(gt)); +} + +region z_dist_from_soma_le(double r0) { + region le = reg::projection_le(r0); + region ge = reg::projection_ge(-r0); + return intersect(std::move(le), std::move(ge)); +} + +region z_dist_from_soma_gt(double r0) { + region lt = reg::projection_lt(-r0); + region gt = reg::projection_gt(r0); + return region{join(std::move(lt), std::move(gt))}; +} + +region z_dist_from_soma_ge(double r0) { + region lt = reg::projection_le(-r0); + region gt = reg::projection_ge(r0); + return region{join(std::move(lt), std::move(gt))}; +} + +// Named region. 
struct named_: region_tag { explicit named_(std::string name): name(std::move(name)) {} std::string name; diff --git a/arbor/util/cbrng.hpp b/arbor/util/cbrng.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6f78e44942c6f8ba882265f2b9d669c6541a3191 --- /dev/null +++ b/arbor/util/cbrng.hpp @@ -0,0 +1,41 @@ +#pragma once +#include <vector> + +#include <Random123/threefry.h> +#include <Random123/uniform.hpp> + +namespace arb { +namespace util { +// Elements `left`..`right` (inclusive) of the sequence of uniform doubles generated from `seed`: element i is word i%2 of the Threefry2x64 output for counter i/2. +inline std::vector<double> uniform(uint64_t seed, unsigned left, unsigned right) { + using cbrng = r123::Threefry2x64; + std::vector<double> r; + + cbrng::key_type key = {{seed}}; + cbrng::ctr_type ctr = {{0,0}}; + cbrng g; + // An odd `left` starts on the second word of its counter block + unsigned i = left; + if (i%2 && i<=right) { + ctr[0] = i/2; + cbrng::ctr_type rand = g(ctr, key); + r.push_back(r123::u01<double>(rand[1])); + ++i; + } + while (i < 2*((right+1)/2)) { + ctr[0] = i/2; + cbrng::ctr_type rand = g(ctr, key); + r.push_back(r123::u01<double>(rand[0])); + r.push_back(r123::u01<double>(rand[1])); + i += 2; + } + if (i<=right) { + ctr[0] = i/2; + cbrng::ctr_type rand = g(ctr, key); + r.push_back(r123::u01<double>(rand[0])); + } + return r; +} + +} +} \ No newline at end of file diff --git a/ext/CMakeLists.txt b/ext/CMakeLists.txt index 49aa903e132f58a154f43f7f64c3b22320ef7171..634a28898410c4bbd6d3217dc9d7a04655dd82f1 100644 --- a/ext/CMakeLists.txt +++ b/ext/CMakeLists.txt @@ -8,6 +8,12 @@ target_include_directories(ext-json INTERFACE json/single_include) add_library(ext-tclap INTERFACE) target_include_directories(ext-tclap INTERFACE tclap/include) +# Random123 (DE Shaw Research) counter-based random number generators (header-only) + +add_library(ext-random123 INTERFACE) +target_include_directories(ext-random123 INTERFACE 
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/random123/include>) +install(TARGETS ext-random123 EXPORT arbor-targets) + # Google benchmark for microbenchmarks: check_git_submodule(gbench google-benchmark) diff --git a/ext/random123/LICENSE 
b/ext/random123/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..c6094acaf20c26df516f01549101dabc7fde5c93 --- /dev/null +++ b/ext/random123/LICENSE @@ -0,0 +1,31 @@ +/** @page LICENSE +Copyright 2010-2012, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ diff --git a/ext/random123/include/Random123/MicroURNG.hpp b/ext/random123/include/Random123/MicroURNG.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9ea77325514c07ab3e4393467dba018e42c4f412 --- /dev/null +++ b/ext/random123/include/Random123/MicroURNG.hpp @@ -0,0 +1,146 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef __MicroURNG_dot_hpp__ +#define __MicroURNG_dot_hpp__ + +#include <stdexcept> +#include <limits> + +namespace r123{ +/** + Given a CBRNG whose ctr_type has an unsigned integral value_type, + MicroURNG<CBRNG>(c, k) is a type that satisfies the + requirements of a C++11 Uniform Random Number Generator. + + The intended purpose is for a MicroURNG to be passed + as an argument to a C++11 Distribution, e.g., + std::normal_distribution. See examples/MicroURNG.cpp. + + The MicroURNG functor has a period of "only" + + ctr_type.size()*2^32, + + after which it will silently repeat. + + The high 32 bits of the highest word in the counter c, passed to + the constructor must be zero. MicroURNG uses these bits to + "count". + + Older versions of the library permitted a second template + parameter by which the caller could control the number of + bits devoted to the URNG's internal counter. This flexibility + has been disabled because URNGs created with different + numbers of counter bits could, conceivably "collide". + +\code + typedef ?someCBRNG? RNG; + RNG::ctr_type c = ...; // under application control + RNG::key_type k = ...; // + std::normal_distribution<float> nd; + MicroURNG<RNG> urng(c, k); + for(???){ + ... + nd(urng); // may be called several hundred times with BITS=10 + ... + } +\endcode +*/ + +template<typename CBRNG> +class MicroURNG{ + // According to C++11, a URNG requires only a result_type, + // operator()(), min() and max() methods. Everything else + // (ctr_type, key_type, reset() method, etc.) is "value added" + // for the benefit of users that "know" that they're dealing with + // a MicroURNG. 
+public: + typedef CBRNG cbrng_type; + static const int BITS = 32; + typedef typename cbrng_type::ctr_type ctr_type; + typedef typename cbrng_type::key_type key_type; + typedef typename cbrng_type::ukey_type ukey_type; + typedef typename ctr_type::value_type result_type; + + R123_STATIC_ASSERT( std::numeric_limits<result_type>::digits >= BITS, "The result_type must have at least 32 bits" ); + + result_type operator()(){ + if(last_elem == 0){ + // jam n into the high bits of c + const size_t W = std::numeric_limits<result_type>::digits; + ctr_type c = c0; + c[c0.size()-1] |= n<<(W-BITS); + rdata = b(c,k); + n++; + last_elem = rdata.size(); + } + return rdata[--last_elem]; + } + MicroURNG(cbrng_type _b, ctr_type _c0, ukey_type _uk) : b(_b), c0(_c0), k(_uk), n(0), last_elem(0) { + chkhighbits(); + } + MicroURNG(ctr_type _c0, ukey_type _uk) : b(), c0(_c0), k(_uk), n(0), last_elem(0) { + chkhighbits(); + } + + // _Min and _Max work around a bug in the library shipped with MacOS Xcode 4.5.2. + // See the comment in conventional/Engine.hpp. 
+ const static result_type _Min = 0; + const static result_type _Max = ~((result_type)0); + + static R123_CONSTEXPR result_type min R123_NO_MACRO_SUBST () { return _Min; } + static R123_CONSTEXPR result_type max R123_NO_MACRO_SUBST () { return _Max; } + // extra methods: + const ctr_type& counter() const{ return c0; } + void reset(ctr_type _c0, ukey_type _uk){ + c0 = _c0; + chkhighbits(); + k = _uk; + n = 0; + last_elem = 0; + } + +private: + cbrng_type b; + ctr_type c0; + key_type k; + R123_ULONG_LONG n; + size_t last_elem; + ctr_type rdata; + void chkhighbits(){ + result_type r = c0[c0.size()-1]; + result_type mask = ((uint64_t)std::numeric_limits<result_type>::max R123_NO_MACRO_SUBST ())>>BITS; + if((r&mask) != r) + throw std::runtime_error("MicroURNG: c0, does not have high bits clear"); + } +}; +} // namespace r123 +#endif diff --git a/ext/random123/include/Random123/ReinterpretCtr.hpp b/ext/random123/include/Random123/ReinterpretCtr.hpp new file mode 100644 index 0000000000000000000000000000000000000000..164a38b0a56bc09232d9a7caa8601c29ebc2b0a9 --- /dev/null +++ b/ext/random123/include/Random123/ReinterpretCtr.hpp @@ -0,0 +1,88 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __ReinterpretCtr_dot_hpp__ +#define __ReinterpretCtr_dot_hpp__ + +#include "features/compilerfeatures.h" +#include <cstring> + +namespace r123{ +/*! + ReinterpretCtr uses memcpy to map back and forth + between a CBRNG's ctr_type and the specified ToType. For example, + after: + + typedef ReinterpretCtr<r123array4x32, Philox2x64> G; + + G is a bona fide CBRNG with ctr_type r123array4x32. + + WARNING: ReinterpretCtr is endian dependent. The + values returned by G, declared as above, + will depend on the endianness of the machine on which it runs. + */ + +template <typename ToType, typename CBRNG> +struct ReinterpretCtr{ + typedef ToType ctr_type; + typedef typename CBRNG::key_type key_type; + typedef typename CBRNG::ctr_type bctype; + typedef typename CBRNG::ukey_type ukey_type; + R123_STATIC_ASSERT(sizeof(ToType) == sizeof(bctype) && sizeof(typename bctype::value_type) != 16, + "ReinterpretCtr: sizeof(ToType) is not the same as sizeof(CBRNG::ctr_type) or CBRNG::ctr_type::value_type looks like it might be __m128i"); + // It's amazingly difficult to safely do conversions with __m128i. 
+ // If we use the operator() implementation below with a CBRNG + // whose ctr_type is r123array1xm128i, gcc4.6 optimizes away the + // memcpys, inlines the operator()(c,k), and produces assembly + // language that ends with an aesenclast instruction with a + // destination operand pointing to an unaligned memory address ... + // Segfault! See: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50444 + // MSVC also produces code that crashes. We suspect a + // similar mechanism but haven't done the debugging necessary to + // be sure. We were able to 'fix' gcc4.6 by making bc a mutable + // data member rather than declaring it in the scope of + // operator(). That didn't fix the MSVC problems, though. + // + // Conclusion - don't touch __m128i, at least for now. The + // easiest (but highly imprecise) way to do that is the static + // assertion above that rejects bctype::value_types of size 16. - + // Sep 2011. + /* operator(): copy the bytes of c into a bctype, apply a default-constructed + CBRNG to it with key k, and copy the result bytes back into c. */ + ctr_type operator()(ctr_type c, key_type k){ + bctype bc; + std::memcpy(&bc, &c, sizeof(c)); + CBRNG b; + bc = b(bc, k); + std::memcpy(&c, &bc, sizeof(bc)); + return c; + } +}; +} // namespace r123 +#endif diff --git a/ext/random123/include/Random123/aes.h b/ext/random123/include/Random123/aes.h new file mode 100644 index 0000000000000000000000000000000000000000..3095fac37de5d051b602fb53eb887ebf1ac59863 --- /dev/null +++ b/ext/random123/include/Random123/aes.h @@ -0,0 +1,398 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. 
Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __Random123_aes_dot_hpp__ +#define __Random123_aes_dot_hpp__ + +#include "features/compilerfeatures.h" +#include "array.h" + +/* Implement a bona fide AES block cipher. It's minimally +// checked against the test vector in FIPS-197 in ut_aes.cpp. 
*/ +#if R123_USE_AES_NI + +/** @ingroup AESNI */ +typedef struct r123array1xm128i aesni1xm128i_ctr_t; +/** @ingroup AESNI */ +typedef struct r123array1xm128i aesni1xm128i_ukey_t; +/** @ingroup AESNI */ +typedef struct r123array4x32 aesni4x32_ukey_t; +/** @ingroup AESNI */ +enum r123_enum_aesni1xm128i { aesni1xm128i_rounds = 10 }; + +/** \cond HIDDEN_FROM_DOXYGEN */ +/* AES_128_ASSIST: combine the previous round key (temp1) with the + _mm_aeskeygenassist_si128 result (temp2) to produce the next round key. */ +R123_STATIC_INLINE __m128i AES_128_ASSIST (__m128i temp1, __m128i temp2) { + __m128i temp3; + temp2 = _mm_shuffle_epi32 (temp2 ,0xff); + temp3 = _mm_slli_si128 (temp1, 0x4); + temp1 = _mm_xor_si128 (temp1, temp3); + temp3 = _mm_slli_si128 (temp3, 0x4); + temp1 = _mm_xor_si128 (temp1, temp3); + temp3 = _mm_slli_si128 (temp3, 0x4); + temp1 = _mm_xor_si128 (temp1, temp3); + temp1 = _mm_xor_si128 (temp1, temp2); + return temp1; +} + +/* aesni1xm128iexpand: write the 11-entry key schedule into ret; ret[0] is the + user key and ret[1..10] are derived one after another via AES_128_ASSIST. */ +R123_STATIC_INLINE void aesni1xm128iexpand(aesni1xm128i_ukey_t uk, __m128i ret[11]) +{ + __m128i rkey = uk.v[0].m; + __m128i tmp2; + + ret[0] = rkey; + tmp2 = _mm_aeskeygenassist_si128(rkey, 0x1); + rkey = AES_128_ASSIST(rkey, tmp2); + ret[1] = rkey; + + tmp2 = _mm_aeskeygenassist_si128(rkey, 0x2); + rkey = AES_128_ASSIST(rkey, tmp2); + ret[2] = rkey; + + tmp2 = _mm_aeskeygenassist_si128(rkey, 0x4); + rkey = AES_128_ASSIST(rkey, tmp2); + ret[3] = rkey; + + tmp2 = _mm_aeskeygenassist_si128(rkey, 0x8); + rkey = AES_128_ASSIST(rkey, tmp2); + ret[4] = rkey; + + tmp2 = _mm_aeskeygenassist_si128(rkey, 0x10); + rkey = AES_128_ASSIST(rkey, tmp2); + ret[5] = rkey; + + tmp2 = _mm_aeskeygenassist_si128(rkey, 0x20); + rkey = AES_128_ASSIST(rkey, tmp2); + ret[6] = rkey; + + tmp2 = _mm_aeskeygenassist_si128(rkey, 0x40); + rkey = AES_128_ASSIST(rkey, tmp2); + ret[7] = rkey; + + tmp2 = _mm_aeskeygenassist_si128(rkey, 0x80); + rkey = AES_128_ASSIST(rkey, tmp2); + ret[8] = rkey; + + tmp2 = _mm_aeskeygenassist_si128(rkey, 0x1b); + rkey = AES_128_ASSIST(rkey, tmp2); + ret[9] = rkey; + + tmp2 = _mm_aeskeygenassist_si128(rkey, 0x36); + rkey = AES_128_ASSIST(rkey, tmp2); + ret[10] = rkey; +} +/** 
\endcond */ + +#ifdef __cplusplus +/** @ingroup AESNI */ +struct aesni1xm128i_key_t{ + /* Expanded key schedule (11 round keys); every constructor and assignment + below fills it with aesni1xm128iexpand. */ + __m128i k[11]; + aesni1xm128i_key_t(){ + aesni1xm128i_ukey_t uk; + uk.v[0].m = _mm_setzero_si128(); + aesni1xm128iexpand(uk, k); + } + aesni1xm128i_key_t(const aesni1xm128i_ukey_t& uk){ + aesni1xm128iexpand(uk, k); + } + aesni1xm128i_key_t(const aesni4x32_ukey_t& uk){ + aesni1xm128i_ukey_t uk128; + uk128.v[0].m = _mm_set_epi32(uk.v[3], uk.v[2], uk.v[1], uk.v[0]); + aesni1xm128iexpand(uk128, k); + } + aesni1xm128i_key_t& operator=(const aesni1xm128i_ukey_t& uk){ + aesni1xm128iexpand(uk, k); + return *this; + } + aesni1xm128i_key_t& operator=(const aesni4x32_ukey_t& uk){ + aesni1xm128i_ukey_t uk128; + uk128.v[0].m = _mm_set_epi32(uk.v[3], uk.v[2], uk.v[1], uk.v[0]); + aesni1xm128iexpand(uk128, k); + return *this; + } + bool operator==(const aesni1xm128i_key_t& rhs) const{ + for(int i=0; i<11; ++i){ + // Sigh... No r123m128i(__m128i) constructor! + r123m128i li; li.m = k[i]; + r123m128i ri; ri.m = rhs.k[i]; + if( li != ri ) return false; + } + return true; + } + bool operator!=(const aesni1xm128i_key_t& rhs) const{ + return !(*this == rhs); + } + friend std::ostream& operator<<(std::ostream& os, const aesni1xm128i_key_t& v){ + r123m128i ki; + for(int i=0; i<10; ++i){ + ki.m = v.k[i]; + os << ki << " "; + } + ki.m = v.k[10]; + return os << ki; + } + friend std::istream& operator>>(std::istream& is, aesni1xm128i_key_t& v){ + r123m128i ki; + for(int i=0; i<11; ++i){ + is >> ki; + v.k[i] = ki; + } + return is; + } +}; +#else +typedef struct { + __m128i k[11]; +}aesni1xm128i_key_t; + +/** @ingroup AESNI */ +R123_STATIC_INLINE aesni1xm128i_key_t aesni1xm128ikeyinit(aesni1xm128i_ukey_t uk){ + aesni1xm128i_key_t ret; + aesni1xm128iexpand(uk, ret.k); + return ret; +} +#endif + +/* aesni1xm128i: encrypt one 128-bit counter with the expanded key k: xor with + k.k[0], nine aesenc rounds using k.k[1..9], then aesenclast with k.k[10]. */ +/** @ingroup AESNI */ +R123_STATIC_INLINE aesni1xm128i_ctr_t aesni1xm128i(aesni1xm128i_ctr_t in, aesni1xm128i_key_t k) { + __m128i x = _mm_xor_si128(k.k[0], in.v[0].m); + x = _mm_aesenc_si128(x, k.k[1]); + x = 
_mm_aesenc_si128(x, k.k[2]); + x = _mm_aesenc_si128(x, k.k[3]); + x = _mm_aesenc_si128(x, k.k[4]); + x = _mm_aesenc_si128(x, k.k[5]); + x = _mm_aesenc_si128(x, k.k[6]); + x = _mm_aesenc_si128(x, k.k[7]); + x = _mm_aesenc_si128(x, k.k[8]); + x = _mm_aesenc_si128(x, k.k[9]); + x = _mm_aesenclast_si128(x, k.k[10]); + { + aesni1xm128i_ctr_t ret; + ret.v[0].m = x; + return ret; + } +} + +/* aesni1xm128i_R: the round count is fixed, so R must be 10 (checked with + R123_ASSERT); otherwise identical to aesni1xm128i. */ +/** @ingroup AESNI */ +R123_STATIC_INLINE aesni1xm128i_ctr_t aesni1xm128i_R(unsigned R, aesni1xm128i_ctr_t in, aesni1xm128i_key_t k){ + R123_ASSERT(R==10); + return aesni1xm128i(in, k); +} + + +/** @ingroup AESNI */ +typedef struct r123array4x32 aesni4x32_ctr_t; +/** @ingroup AESNI */ +typedef aesni1xm128i_key_t aesni4x32_key_t; +/** @ingroup AESNI */ +enum r123_enum_aesni4x32 { aesni4x32_rounds = 10 }; +/** @ingroup AESNI */ +R123_STATIC_INLINE aesni4x32_key_t aesni4x32keyinit(aesni4x32_ukey_t uk){ + aesni1xm128i_ukey_t uk128; + aesni4x32_key_t ret; + uk128.v[0].m = _mm_set_epi32(uk.v[3], uk.v[2], uk.v[1], uk.v[0]); + aesni1xm128iexpand(uk128, ret.k); + return ret; +} + +/** @ingroup AESNI */ +/** The aesni4x32_R function provides a C API to the @ref AESNI "AESNI" CBRNG, allowing the number of rounds to be specified explicitly **/ +R123_STATIC_INLINE aesni4x32_ctr_t aesni4x32_R(unsigned int Nrounds, aesni4x32_ctr_t c, aesni4x32_key_t k){ + aesni1xm128i_ctr_t c128; + c128.v[0].m = _mm_set_epi32(c.v[3], c.v[2], c.v[1], c.v[0]); + c128 = aesni1xm128i_R(Nrounds, c128, k); + _mm_storeu_si128((__m128i*)&c.v[0], c128.v[0].m); + return c; +} + +#define aesni4x32_rounds aesni1xm128i_rounds + +/** The aesni4x32 macro provides a C API to the @ref AESNI "AESNI" CBRNG, uses the default number of rounds i.e. 
\c aesni4x32_rounds **/ +/** @ingroup AESNI */ +#define aesni4x32(c,k) aesni4x32_R(aesni4x32_rounds, c, k) + +#ifdef __cplusplus +namespace r123{ +/** +@defgroup AESNI ARS and AESNI Classes and Typedefs + +The ARS4x32, ARS1xm128i, AESNI4x32 and AESNI1xm128i classes export the member functions, typedefs and +operator overloads required by a @ref CBRNG "CBRNG" class. + +ARS1xm128i and AESNI1xm128i are based on the AES block cipher and rely on the AES-NI hardware instructions +available on some new (2011) CPUs. + +The ARS1xm128i CBRNG and the use of AES for random number generation are described in +<a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers: As Easy as 1, 2, 3</i> </a>. +Although it uses some cryptographic primitives, ARS1xm128i uses a cryptographically weak key schedule and is \b not suitable for cryptographic use. + +@class AESNI1xm128i +@ingroup AESNI +AESNI exports the member functions, typedefs and operator overloads required by a @ref CBRNG class. + +AESNI1xm128i uses the cryptographic AES round function, including the cryptographic key schedule. + +In contrast to the other CBRNGs in the Random123 library, the AESNI1xm128i_R::key_type is opaque +and is \b not identical to the AESNI1xm128i_R::ukey_type. Creating a key_type, using either the constructor +or assignment operator, is significantly more time-consuming than running the bijection (hundreds +of clock cycles vs. tens of clock cycles). + +AESNI1xm128i is only available when the feature-test macro R123_USE_AES_NI is true, which +should occur only when the compiler is configured to generate AES-NI instructions (or +when defaults are overridden by compile-time, compiler-command-line options). + +As of September 2011, the authors know of no statistical flaws with AESNI1xm128i. It +would be an event of major cryptographic note if any such flaws were ever found. 
+*/ +struct AESNI1xm128i{ + typedef aesni1xm128i_ctr_t ctr_type; + typedef aesni1xm128i_ukey_t ukey_type; + typedef aesni1xm128i_key_t key_type; + static const unsigned int rounds=10; + ctr_type operator()(ctr_type ctr, key_type key) const{ + return aesni1xm128i(ctr, key); + } +}; + +/* @class AESNI4x32 */ +struct AESNI4x32{ + typedef aesni4x32_ctr_t ctr_type; + typedef aesni4x32_ukey_t ukey_type; + typedef aesni4x32_key_t key_type; + static const unsigned int rounds=10; + ctr_type operator()(ctr_type ctr, key_type key) const{ + return aesni4x32(ctr, key); + } +}; + +/** @ingroup AESNI + @class AESNI1xm128i_R + +AESNI1xm128i_R is provided for completeness, but is only instantiable with ROUNDS=10, in +which case it is identical to AESNI1xm128i */ +template <unsigned ROUNDS=10> +struct AESNI1xm128i_R : public AESNI1xm128i{ + R123_STATIC_ASSERT(ROUNDS==10, "AESNI1xm128i_R<R> is only valid with R=10"); +}; + +/** @class AESNI4x32_R **/ +template <unsigned ROUNDS=10> +struct AESNI4x32_R : public AESNI4x32{ + R123_STATIC_ASSERT(ROUNDS==10, "AESNI4x32_R<R> is only valid with R=10"); +}; +} // namespace r123 +#endif /* __cplusplus */ + +#endif /* R123_USE_AES_NI */ + +#if R123_USE_AES_OPENSSL +#include "string.h" +#include <openssl/aes.h> +typedef struct r123array16x8 aesopenssl16x8_ctr_t; +typedef struct r123array16x8 aesopenssl16x8_ukey_t; +#ifdef __cplusplus +/* aesopenssl16x8_key_t: wraps an OpenSSL AES_KEY set up for 128-bit encryption + from a 16-byte user key (an all-zero key when default-constructed). */ +struct aesopenssl16x8_key_t{ + AES_KEY k; + aesopenssl16x8_key_t(){ + aesopenssl16x8_ukey_t ukey={{}}; + AES_set_encrypt_key((const unsigned char *)&ukey.v[0], 128, &k); + } + aesopenssl16x8_key_t(const aesopenssl16x8_ukey_t& ukey){ + AES_set_encrypt_key((const unsigned char *)&ukey.v[0], 128, &k); + } + aesopenssl16x8_key_t& operator=(const aesopenssl16x8_ukey_t& ukey){ + AES_set_encrypt_key((const unsigned char *)&ukey.v[0], 128, &k); + return *this; + } + bool operator==(const aesopenssl16x8_key_t& rhs) const{ + return (k.rounds == rhs.k.rounds) && 0==::memcmp(&k.rd_key[0], &rhs.k.rd_key[0], (k.rounds+1) * 4 * 
sizeof(uint32_t)); + } + bool operator!=(const aesopenssl16x8_key_t& rhs) const{ + return !(*this == rhs); + } + friend std::ostream& operator<<(std::ostream& os, const aesopenssl16x8_key_t& v){ + os << v.k.rounds; + const unsigned int *p = &v.k.rd_key[0]; + for(int i=0; i<(v.k.rounds+1); ++i){ + os << " " << p[0] << " " << p[1] << " " << p[2] << " " << p[3]; + p += 4; + } + return os; + } + friend std::istream& operator>>(std::istream& is, aesopenssl16x8_key_t& v){ + /* NOTE(review): rounds is read from the stream and then bounds the rd_key + loop without validation -- confirm only trusted input is extracted here. */ + is >> v.k.rounds; + unsigned int *p = &v.k.rd_key[0]; + for(int i=0; i<(v.k.rounds+1); ++i){ + is >> p[0] >> p[1] >> p[2] >> p[3]; + p += 4; + } + return is; + } +}; +#else +typedef struct aesopenssl16x8_key_t{ + AES_KEY k; +}aesopenssl16x8_key_t; +R123_STATIC_INLINE struct aesopenssl16x8_key_t aesopenssl16x8keyinit(aesopenssl16x8_ukey_t uk){ + aesopenssl16x8_key_t ret; + AES_set_encrypt_key((const unsigned char *)&uk.v[0], 128, &ret.k); + return ret; +} +#endif + +R123_STATIC_INLINE R123_FORCE_INLINE(aesopenssl16x8_ctr_t aesopenssl16x8_R(aesopenssl16x8_ctr_t ctr, aesopenssl16x8_key_t key)); +R123_STATIC_INLINE +aesopenssl16x8_ctr_t aesopenssl16x8_R(aesopenssl16x8_ctr_t ctr, aesopenssl16x8_key_t key){ + aesopenssl16x8_ctr_t ret; + AES_encrypt((const unsigned char*)&ctr.v[0], (unsigned char *)&ret.v[0], &key.k); + return ret; +} + +#define aesopenssl16x8_rounds aesni4x32_rounds +/* aesopenssl16x8_R takes (ctr, key) and has no round-count parameter, so the + convenience macro forwards c and k to it. */ +#define aesopenssl16x8(c,k) aesopenssl16x8_R(c, k) + +#ifdef __cplusplus +namespace r123{ +struct AESOpenSSL16x8{ + typedef aesopenssl16x8_ctr_t ctr_type; + typedef aesopenssl16x8_key_t key_type; + typedef aesopenssl16x8_ukey_t ukey_type; + static const unsigned int rounds=10; + ctr_type operator()(const ctr_type& in, const key_type& k){ + ctr_type out; + AES_encrypt((const unsigned char *)&in[0], (unsigned char *)&out[0], &k.k); + return out; + } +}; +} // namespace r123 +#endif /* __cplusplus */ +#endif /* R123_USE_AES_OPENSSL */ + +#endif diff --git a/ext/random123/include/Random123/array.h 
b/ext/random123/include/Random123/array.h new file mode 100644 index 0000000000000000000000000000000000000000..c560c3fee1a860f5c8f1785c1b181961be285226 --- /dev/null +++ b/ext/random123/include/Random123/array.h @@ -0,0 +1,348 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef _r123array_dot_h__ +#define _r123array_dot_h__ +#include "features/compilerfeatures.h" +#include "features/sse.h" + +#if !defined(__cplusplus) || defined(__METAL_MACOS__) +#define CXXMETHODS(_N, W, T) +#define CXXOVERLOADS(_N, W, T) +#define CXXMETHODS_REQUIRING_STL +#else + +#include <stddef.h> +#include <algorithm> +#include <stdexcept> +#include <iterator> +#include <limits> +#include <iostream> + +/** @defgroup arrayNxW The r123arrayNxW classes + + Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers. + It is functionally equivalent to the C++11 std::array<uintW_t, N>, + but does not require C++11 features or libraries. + + In addition to meeting most of the requirements of a Container, + it also has a member function, incr(), which increments the zero-th + element and carries overflows into higher indexed elements. Thus, + by using incr(), sequences of up to 2^(N*W) distinct values + can be produced. + + If SSE is supported by the compiler, then the class + r123array1xm128i is also defined, in which the data member is an + array of one r123m128i object. + + When compiling with __CUDA_ARCH__ defined, the reverse iterator + methods (rbegin, rend, crbegin, crend) are not defined because + CUDA does not support std::reverse_iterator. + +*/ + +/** @cond HIDDEN_FROM_DOXYGEN */ + +/* assemble_from_u32: build one value_type from consecutive 32-bit words read + from p32, placing word i at bit offset 32*i; (3+sizeof(value_type))/4 words + are consumed. */ +template <typename value_type> +inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){ + value_type v=0; + for(size_t i=0; i<(3+sizeof(value_type))/4; ++i) + v |= ((value_type)(*p32++)) << (32*i); + return v; +} + +/** @endcond */ + +#ifdef __CUDA_ARCH__ +/* CUDA can't handle std::reverse_iterator. 
We *could* implement it + ourselves, but let's not bother until somebody really feels a need + to reverse-iterate through an r123array */ +#define CXXMETHODS_REQUIRING_STL +#else +#define CXXMETHODS_REQUIRING_STL \ + public: \ + typedef std::reverse_iterator<iterator> reverse_iterator; \ + typedef std::reverse_iterator<const_iterator> const_reverse_iterator; \ + R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); } \ + R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } \ + R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); } \ + R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \ + R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \ + R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } +#endif + +// Work-alike methods and typedefs modeled on std::array: +#define CXXMETHODS(_N, W, T) \ + typedef T value_type; \ + typedef T* iterator; \ + typedef const T* const_iterator; \ + typedef value_type& reference; \ + typedef const value_type& const_reference; \ + typedef size_t size_type; \ + typedef ptrdiff_t difference_type; \ + typedef T* pointer; \ + typedef const T* const_pointer; \ + /* Boost.array has static_size. 
C++11 specializes tuple_size */ \ + enum {static_size = _N}; \ + R123_CUDA_DEVICE reference operator[](size_type i){return v[i];} \ + R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];} \ + R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \ + R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \ + R123_CUDA_DEVICE size_type size() const { return _N; } \ + R123_CUDA_DEVICE size_type max_size() const { return _N; } \ + R123_CUDA_DEVICE bool empty() const { return _N==0; }; \ + R123_CUDA_DEVICE iterator begin() { return &v[0]; } \ + R123_CUDA_DEVICE iterator end() { return &v[_N]; } \ + R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; } \ + R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; } \ + R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; } \ + R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; } \ + R123_CUDA_DEVICE pointer data(){ return &v[0]; } \ + R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; } \ + R123_CUDA_DEVICE reference front(){ return v[0]; } \ + R123_CUDA_DEVICE const_reference front() const{ return v[0]; } \ + R123_CUDA_DEVICE reference back(){ return v[_N-1]; } \ + R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; } \ + R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \ + /* CUDA3 does not have std::equal */ \ + for (size_t i = 0; i < _N; ++i) \ + if (v[i] != rhs.v[i]) return false; \ + return true; \ + } \ + R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \ + /* CUDA3 does not have std::fill_n */ \ + R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \ + R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \ + /* CUDA3 does not have std::swap_ranges 
*/ \ + for (size_t i = 0; i < _N; ++i) { \ + T tmp = v[i]; \ + v[i] = rhs.v[i]; \ + rhs.v[i] = tmp; \ + } \ + } \ + R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){ \ + /* This test is tricky because we're trying to avoid spurious \ + complaints about illegal shifts, yet still be compile-time \ + evaluated. */ \ + if(sizeof(T)<sizeof(n) && n>>((sizeof(T)<sizeof(n))?8*sizeof(T):0) ) \ + return incr_carefully(n); \ + if(n==1){ \ + ++v[0]; \ + if(_N==1 || R123_BUILTIN_EXPECT(!!v[0], 1)) return *this; \ + }else{ \ + v[0] += n; \ + if(_N==1 || R123_BUILTIN_EXPECT(n<=v[0], 1)) return *this; \ + } \ + /* We expect that the N==?? tests will be \ + constant-folded/optimized away by the compiler, so only the \ + overflow tests (!!v[i]) remain to be done at runtime. For \ + small values of N, it would be better to do this as an \ + unconditional sequence of adc. An experiment/optimization \ + for another day... \ + N.B. The weird subscripting: v[_N>3?3:0] is to silence \ + a spurious error from icpc \ + */ \ + ++v[_N>1?1:0]; \ + if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \ + ++v[_N>2?2:0]; \ + if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this; \ + ++v[_N>3?3:0]; \ + for(size_t i=4; i<_N; ++i){ \ + if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this; \ + ++v[i]; \ + } \ + return *this; \ + } \ + /* seed(SeedSeq) would be a constructor if having a constructor */ \ + /* didn't cause headaches with defaults */ \ + template <typename SeedSeq> \ + R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){ \ + r123array##_N##x##W ret; \ + const size_t Ngen = _N*((3+sizeof(value_type))/4); \ + uint32_t u32[Ngen]; \ + uint32_t *p32 = &u32[0]; \ + ss.generate(&u32[0], &u32[Ngen]); \ + for(size_t i=0; i<_N; ++i){ \ + ret.v[i] = assemble_from_u32<value_type>(p32); \ + p32 += (3+sizeof(value_type))/4; \ + } \ + return ret; \ + } \ +protected: \ + R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \ + /* n may be 
greater than the maximum value of a single value_type */ \ + value_type vtn; \ + vtn = n; \ + v[0] += n; \ + const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \ + for(size_t i=1; i<_N; ++i){ \ + if(rshift){ \ + n >>= rshift; \ + }else{ \ + n=0; \ + } \ + if( v[i-1] < vtn ) \ + ++n; \ + if( n==0 ) break; \ + vtn = n; \ + v[i] += n; \ + } \ + return *this; \ + } \ + +/** @cond HIDDEN_FROM_DOXYGEN */ + +// There are several tricky considerations for the insertion and extraction +// operators: +// - we would like to be able to print r123array16x8 as a sequence of 16 integers, +// not as 16 bytes. +// - we would like to be able to print r123array1xm128i. +// - we do not want an int conversion operator in r123m128i because it causes +// lots of ambiguity problems with automatic promotions. +// Solution: r123arrayinsertable and r123arrayextractable + +template<typename T> +struct r123arrayinsertable{ + const T& v; + r123arrayinsertable(const T& t_) : v(t_) {} + friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable<T>& t){ + return os << t.v; + } +}; + +template<> +struct r123arrayinsertable<uint8_t>{ + const uint8_t& v; + r123arrayinsertable(const uint8_t& t_) : v(t_) {} + friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable<uint8_t>& t){ + return os << (int)t.v; + } +}; + +template<typename T> +struct r123arrayextractable{ + T& v; + r123arrayextractable(T& t_) : v(t_) {} + friend std::istream& operator>>(std::istream& is, r123arrayextractable<T>& t){ + return is >> t.v; + } +}; + +template<> +struct r123arrayextractable<uint8_t>{ + uint8_t& v; + r123arrayextractable(uint8_t& t_) : v(t_) {} + friend std::istream& operator>>(std::istream& is, r123arrayextractable<uint8_t>& t){ + int i; + is >> i; + t.v = i; + return is; + } +}; +/** @endcond */ + +/* CXXOVERLOADS: stream insertion (elements separated by a single space) and + element-by-element extraction for r123arrayNxW, plus the r123::ArrayNxW typedef. */ +#define CXXOVERLOADS(_N, W, T) \ + \ +inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){ \ + os << 
r123arrayinsertable<T>(a.v[0]); \ + for(size_t i=1; i<_N; ++i) \ + os << " " << r123arrayinsertable<T>(a.v[i]); \ + return os; \ +} \ + \ +inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){ \ + for(size_t i=0; i<_N; ++i){ \ + r123arrayextractable<T> x(a.v[i]); \ + is >> x; \ + } \ + return is; \ +} \ + \ +namespace r123{ \ + typedef r123array##_N##x##W Array##_N##x##W; \ +} + +#endif /* __cplusplus */ + +/* _r123array_tpl expands to a declaration of struct r123arrayNxW. + + In C, it's nothing more than a struct containing an array of N + objects of type T. + + In C++ it's the same, but endowed with an assortment of member + functions, typedefs and friends. In C++, r123arrayNxW looks a lot + like std::array<T,N>, has most of the capabilities of a container, + and satisfies the requirements outlined in compat/Engine.hpp for + counter and key types. ArrayNxW, in the r123 namespace is + a typedef equivalent to r123arrayNxW. +*/ + +#define _r123array_tpl(_N, W, T) \ + /** @ingroup arrayNxW */ \ + /** @see arrayNxW */ \ +struct r123array##_N##x##W{ \ + T v[_N]; \ + CXXMETHODS(_N, W, T) \ + CXXMETHODS_REQUIRING_STL \ +}; \ + \ +CXXOVERLOADS(_N, W, T) + + +_r123array_tpl(1, 32, uint32_t) /* r123array1x32 */ +_r123array_tpl(2, 32, uint32_t) /* r123array2x32 */ +_r123array_tpl(4, 32, uint32_t) /* r123array4x32 */ +_r123array_tpl(8, 32, uint32_t) /* r123array8x32 */ + +#if R123_USE_64BIT +_r123array_tpl(1, 64, uint64_t) /* r123array1x64 */ +_r123array_tpl(2, 64, uint64_t) /* r123array2x64 */ +_r123array_tpl(4, 64, uint64_t) /* r123array4x64 */ +#endif + +_r123array_tpl(16, 8, uint8_t) /* r123array16x8 for ARSsw, AESsw */ + +#if R123_USE_SSE +_r123array_tpl(1, m128i, r123m128i) /* r123array1xm128i for ARSni, AESni */ +#endif + +/* In C++, it's natural to use sizeof(a::value_type), but in C it's + pretty convoluted to figure out the width of the value_type of an + r123arrayNxW: +*/ +#define R123_W(a) (8*sizeof(((a *)0)->v[0])) + +/** @namespace r123 + 
Most of the Random123 C++ API is contained in the r123 namespace. +*/ + +#endif + diff --git a/ext/random123/include/Random123/ars.h b/ext/random123/include/Random123/ars.h new file mode 100644 index 0000000000000000000000000000000000000000..a027b6fe043e55da528569cf4b22aa10296fd6a8 --- /dev/null +++ b/ext/random123/include/Random123/ars.h @@ -0,0 +1,204 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef __Random123_ars_dot_hpp__ +#define __Random123_ars_dot_hpp__ + +#include "features/compilerfeatures.h" +#include "array.h" + +#if R123_USE_AES_NI + +#ifndef ARS1xm128i_DEFAULT_ROUNDS +#define ARS1xm128i_DEFAULT_ROUNDS 7 +#endif + +/** @ingroup AESNI */ +enum r123_enum_ars1xm128i {ars1xm128i_rounds = ARS1xm128i_DEFAULT_ROUNDS}; + +/* ARS1xm128i with Weyl keys. Fast, and Crush-resistant, but NOT CRYPTO. */ +/** @ingroup AESNI */ +typedef struct r123array1xm128i ars1xm128i_ctr_t; +/** @ingroup AESNI */ +typedef struct r123array1xm128i ars1xm128i_key_t; +/** @ingroup AESNI */ +typedef struct r123array1xm128i ars1xm128i_ukey_t; +/** @ingroup AESNI */ +R123_STATIC_INLINE ars1xm128i_key_t ars1xm128ikeyinit(ars1xm128i_ukey_t uk) { return uk; } +/* ars1xm128i_R: xor the counter with the key, then for each threshold 1..9 that + Nrounds exceeds add kweyl to the key and apply aesenc; a final kweyl add and + aesenclast always run. Nrounds must be <= 10 (R123_ASSERT). */ +/** @ingroup AESNI */ +R123_STATIC_INLINE ars1xm128i_ctr_t ars1xm128i_R(unsigned int Nrounds, ars1xm128i_ctr_t in, ars1xm128i_key_t k){ + __m128i kweyl = _mm_set_epi64x(R123_64BIT(0xBB67AE8584CAA73B), /* sqrt(3) - 1.0 */ + R123_64BIT(0x9E3779B97F4A7C15)); /* golden ratio */ + /* N.B. 
the aesenc instructions do the xor *after* + // so if we want to follow the AES pattern, we + // have to do the initial xor explicitly */ + __m128i kk = k.v[0].m; + __m128i v = _mm_xor_si128(in.v[0].m, kk); + ars1xm128i_ctr_t ret; + R123_ASSERT(Nrounds<=10); + if( Nrounds>1 ){ + kk = _mm_add_epi64(kk, kweyl); + v = _mm_aesenc_si128(v, kk); + } + if( Nrounds>2 ){ + kk = _mm_add_epi64(kk, kweyl); + v = _mm_aesenc_si128(v, kk); + } + if( Nrounds>3 ){ + kk = _mm_add_epi64(kk, kweyl); + v = _mm_aesenc_si128(v, kk); + } + if( Nrounds>4 ){ + kk = _mm_add_epi64(kk, kweyl); + v = _mm_aesenc_si128(v, kk); + } + if( Nrounds>5 ){ + kk = _mm_add_epi64(kk, kweyl); + v = _mm_aesenc_si128(v, kk); + } + if( Nrounds>6 ){ + kk = _mm_add_epi64(kk, kweyl); + v = _mm_aesenc_si128(v, kk); + } + if( Nrounds>7 ){ + kk = _mm_add_epi64(kk, kweyl); + v = _mm_aesenc_si128(v, kk); + } + if( Nrounds>8 ){ + kk = _mm_add_epi64(kk, kweyl); + v = _mm_aesenc_si128(v, kk); + } + if( Nrounds>9 ){ + kk = _mm_add_epi64(kk, kweyl); + v = _mm_aesenc_si128(v, kk); + } + kk = _mm_add_epi64(kk, kweyl); + v = _mm_aesenclast_si128(v, kk); + ret.v[0].m = v; + return ret; +} + +/** @def ars1xm128i +@ingroup AESNI +The ars1xm128i macro provides a C API interface to the @ref AESNI "ARS" CBRNG with the default number of rounds i.e. 
\c ars1xm128i_rounds **/ +#define ars1xm128i(c,k) ars1xm128i_R(ars1xm128i_rounds, c, k) + +/** @ingroup AESNI */ +typedef struct r123array4x32 ars4x32_ctr_t; +/** @ingroup AESNI */ +typedef struct r123array4x32 ars4x32_key_t; +/** @ingroup AESNI */ +typedef struct r123array4x32 ars4x32_ukey_t; +/** @ingroup AESNI */ +enum r123_enum_ars4x32 {ars4x32_rounds = ARS1xm128i_DEFAULT_ROUNDS}; +/** @ingroup AESNI */ +R123_STATIC_INLINE ars4x32_key_t ars4x32keyinit(ars4x32_ukey_t uk) { return uk; } +/** @ingroup AESNI */ +R123_STATIC_INLINE ars4x32_ctr_t ars4x32_R(unsigned int Nrounds, ars4x32_ctr_t c, ars4x32_key_t k){ + ars1xm128i_ctr_t c128; + ars1xm128i_key_t k128; + c128.v[0].m = _mm_set_epi32(c.v[3], c.v[2], c.v[1], c.v[0]); + k128.v[0].m = _mm_set_epi32(k.v[3], k.v[2], k.v[1], k.v[0]); + c128 = ars1xm128i_R(Nrounds, c128, k128); + _mm_storeu_si128((__m128i*)&c.v[0], c128.v[0].m); + return c; +} + +/** @def ars4x32 +@ingroup AESNI +The ars4x32 macro provides a C API interface to the @ref AESNI "ARS" CBRNG with the default number of rounds i.e. \c ars4x32_rounds **/ +#define ars4x32(c,k) ars4x32_R(ars4x32_rounds, c, k) + +#ifdef __cplusplus +namespace r123{ +/** +@ingroup AESNI + +ARS1xm128i_R exports the member functions, typedefs and operator overloads required by a @ref CBRNG class. + +ARS1xm128i uses the cryptographic AES round function, but a @b non-cryptographic key schedule +to save time and space. + +ARS1xm128i is only available when the feature-test macro R123_USE_AES_NI is true, which +should occur only when the compiler is configured to generate AES-NI instructions (or +when defaults are overridden by compile-time, compiler-command-line options). + +The template argument, ROUNDS, is the number of times the ARS round +functions will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=5 or more.
+ +@class ARS1xm128i_R + +*/ +template<unsigned int ROUNDS> +struct ARS1xm128i_R{ + typedef ars1xm128i_ctr_t ctr_type; + typedef ars1xm128i_key_t key_type; + typedef ars1xm128i_key_t ukey_type; + static const unsigned int rounds=ROUNDS; + R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ + return ars1xm128i_R(ROUNDS, ctr, key); + } +}; + +/** @class ARS4x32_R + @ingroup AESNI +*/ + +template<unsigned int ROUNDS> +struct ARS4x32_R{ + typedef ars4x32_ctr_t ctr_type; + typedef ars4x32_key_t key_type; + typedef ars4x32_key_t ukey_type; + static const unsigned int rounds=ROUNDS; + R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ + return ars4x32_R(ROUNDS, ctr, key); + } +}; +/** +@ingroup AESNI + +@class ARS1xm128i_R + ARS1xm128i is equivalent to ARS1xm128i_R<7>. With 7 rounds, + the ARS1xm128i CBRNG has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. */ +typedef ARS1xm128i_R<ars1xm128i_rounds> ARS1xm128i; +typedef ARS4x32_R<ars4x32_rounds> ARS4x32; +} // namespace r123 + +#endif /* __cplusplus */ + +#endif /* R123_USE_AES_NI */ + +#endif diff --git a/ext/random123/include/Random123/boxmuller.hpp b/ext/random123/include/Random123/boxmuller.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9c91cf879109133a80844c7c69f26d8e448578fa --- /dev/null +++ b/ext/random123/include/Random123/boxmuller.hpp @@ -0,0 +1,139 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +// This file implements the Box-Muller method for generating gaussian +// random variables (GRVs). Box-Muller has the advantage of +// deterministically requiring exactly two uniform random variables as +// input and producing exactly two GRVs as output, which makes it +// especially well-suited to the counter-based generators in +// Random123. Other methods (e.g., Ziggurat, polar) require an +// indeterminate number of inputs for each output and so require a +// 'MicroURNG' to be used with Random123. The down side of Box-Muller +// is that it calls sincos, log and sqrt, which may be slow. However, +// on GPUs, these functions are remarkably fast, which makes +// Box-Muller the fastest GRV generator we know of on GPUs. 
+// +// This file exports two structs and one overloaded function, +// all in the r123 namespace: +// struct r123::float2{ float x,y; } +// struct r123::double2{ double x,y; } +// +// r123::float2 r123::boxmuller(uint32_t u0, uint32_t u1); +// r123::double2 r123::boxmuller(uint64_t u0, uint64_t u1); +// +// float2 and double2 are identical to their synonymous global- +// namespace structures in CUDA. +// +// This file may not be as portable, and has not been tested as +// rigorously as other files in the library, e.g., the generators. +// Nevertheless, we hope it is useful and we encourage developers to +// copy it and modify it for their own use. We invite comments and +// improvements. + +#ifndef _r123_BOXMULLER_HPP__ +#define _r123_BOXMULLER_HPP__ + +#include <Random123/features/compilerfeatures.h> +#include <Random123/uniform.hpp> +#include <math.h> + +namespace r123{ + +#if !defined(__CUDACC__) +typedef struct { float x, y; } float2; +typedef struct { double x, y; } double2; +#else +typedef ::float2 float2; +typedef ::double2 double2; +#endif + +#if !defined(R123_NO_SINCOS) && defined(__APPLE__) +/* MacOS X 10.10.5 (2015) doesn't have sincosf */ +#define R123_NO_SINCOS 1 +#endif + +#if R123_NO_SINCOS /* enable this if sincos and sincosf are not in the math library */ +R123_CUDA_DEVICE R123_STATIC_INLINE void sincosf(float x, float *s, float *c) { + *s = sinf(x); + *c = cosf(x); +} + +R123_CUDA_DEVICE R123_STATIC_INLINE void sincos(double x, double *s, double *c) { + *s = sin(x); + *c = cos(x); +} +#endif /* sincos is not in the math library */ + +#if !defined(CUDART_VERSION) || CUDART_VERSION < 5000 /* enabled if sincospi and sincospif are not in math lib */ + +R123_CUDA_DEVICE R123_STATIC_INLINE void sincospif(float x, float *s, float *c){ + const float PIf = 3.1415926535897932f; + sincosf(PIf*x, s, c); +} + +R123_CUDA_DEVICE R123_STATIC_INLINE void sincospi(double x, double *s, double *c) { + const double PI = 3.1415926535897932; + sincos(PI*x, s, c); +} 
+#endif /* sincospi is not in math lib */ + +/* + * take two 32bit unsigned random values and return a float2 with + * two random floats in a normal distribution via a Box-Muller transform + */ +R123_CUDA_DEVICE R123_STATIC_INLINE float2 boxmuller(uint32_t u0, uint32_t u1) { + float r; + float2 f; + sincospif(uneg11<float>(u0), &f.x, &f.y); + r = sqrtf(-2.f * logf(u01<float>(u1))); // u01 is guaranteed to avoid 0. + f.x *= r; + f.y *= r; + return f; +} + +/* + * take two 64bit unsigned random values and return a double2 with + * two random doubles in a normal distribution via a Box-Muller transform + */ +R123_CUDA_DEVICE R123_STATIC_INLINE double2 boxmuller(uint64_t u0, uint64_t u1) { + double r; + double2 f; + + sincospi(uneg11<double>(u0), &f.x, &f.y); + r = sqrt(-2. * log(u01<double>(u1))); // u01 is guaranteed to avoid 0. + f.x *= r; + f.y *= r; + return f; +} +} // namespace r123 + +#endif /* BOXMULLER_H__ */ diff --git a/ext/random123/include/Random123/conventional/Engine.hpp b/ext/random123/include/Random123/conventional/Engine.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bd2da2e1911b789adbba32c51e9835128ed96842 --- /dev/null +++ b/ext/random123/include/Random123/conventional/Engine.hpp @@ -0,0 +1,276 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. 
Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __Engine_dot_hpp_ +#define __Engine_dot_hpp_ + +#include "../features/compilerfeatures.h" +#include "../array.h" +#include <limits> +#include <stdexcept> +#include <sstream> +#include <algorithm> +#include <vector> +#if R123_USE_CXX11_TYPE_TRAITS +#include <type_traits> +#endif + +namespace r123{ +/** + If G satisfies the requirements of a CBRNG, and has a ctr_type whose + value_type is an unsigned integral type, then Engine<G> satisfies + the requirements of a C++11 "Uniform Random Number Engine" and can + be used in any context where such an object is expected. + + Note that wrapping a counter based RNG with a traditional API in + this way obscures much of the power of counter based PRNGs. + Nevertheless, it may be of value in applications that are already + coded to work with the C++11 random number engines. 
+ + The MicroURNG template in MicroURNG.hpp + provides the more limited functionality of a C++11 "Uniform + Random Number Generator", but leaves the application in control + of counters and keys and hence may be preferable to the Engine template. + For example, a MicroURNG allows one to use C++11 "Random Number + Distributions" without giving up control over the counters + and keys. +*/ + +template<typename CBRNG> +struct Engine { + typedef CBRNG cbrng_type; + typedef typename CBRNG::ctr_type ctr_type; + typedef typename CBRNG::key_type key_type; + typedef typename CBRNG::ukey_type ukey_type; + typedef typename ctr_type::value_type result_type; + +protected: + cbrng_type b; + key_type key; + ctr_type c; + ctr_type v; + + void fix_invariant(){ + if( v.back() != 0 ) { + result_type vv = v.back(); + v = b(c, key); + v.back() = vv; + } + } +public: + explicit Engine() : b(), c() { + ukey_type x = {{}}; + v.back() = 0; + key = x; + } + explicit Engine(result_type r) : b(), c() { + ukey_type x = {{typename ukey_type::value_type(r)}}; + v.back() = 0; + key = x; + } + // 26.5.3 says that the SeedSeq templates shouldn't participate in + // overload resolution unless the type qualifies as a SeedSeq. + // How that is determined is unspecified, except that "as a + // minimum a type shall not qualify as a SeedSeq if it is + // implicitly convertible to a result_type." + // + // First, we make sure that even the non-const copy constructor + // works as expected. In addition, if we've got C++11 + // type_traits, we use enable_if and is_convertible to implement + // the convertible-to-result_type restriction. Otherwise, the + // template is unconditional and will match in some surprising + // and undesirable situations.
+ Engine(Engine& e) : b(e.b), key(e.key), c(e.c){ + v.back() = e.v.back(); + fix_invariant(); + } + Engine(const Engine& e) : b(e.b), key(e.key), c(e.c){ + v.back() = e.v.back(); + fix_invariant(); + } + + template <typename SeedSeq> + explicit Engine(SeedSeq &s +#if R123_USE_CXX11_TYPE_TRAITS + , typename std::enable_if<!std::is_convertible<SeedSeq, result_type>::value>::type* =0 +#endif + ) + : b(), c() { + ukey_type ukey = ukey_type::seed(s); + key = ukey; + v.back() = 0; + } + void seed(result_type r){ + *this = Engine(r); + } + template <typename SeedSeq> + void seed(SeedSeq &s +#if R123_USE_CXX11_TYPE_TRAITS + , typename std::enable_if<!std::is_convertible<SeedSeq, result_type>::value>::type* =0 +#endif + ){ + *this = Engine(s); + } + void seed(){ + *this = Engine(); + } + friend bool operator==(const Engine& lhs, const Engine& rhs){ + return lhs.c==rhs.c && lhs.v.back() == rhs.v.back() && lhs.key == rhs.key; + } + friend bool operator!=(const Engine& lhs, const Engine& rhs){ + return lhs.c!=rhs.c || lhs.v.back()!=rhs.v.back() || lhs.key!=rhs.key; + } + + friend std::ostream& operator<<(std::ostream& os, const Engine& be){ + return os << be.c << " " << be.key << " " << be.v.back(); + } + + friend std::istream& operator>>(std::istream& is, Engine& be){ + is >> be.c >> be.key >> be.v.back(); + be.fix_invariant(); + return is; + } + + // The <random> shipped with MacOS Xcode 4.5.2 imposes a + // non-standard requirement that URNGs also have static data + // members: _Min and _Max. Later versions of libc++ impose the + // requirement only when constexpr isn't supported. Although the + // Xcode 4.5.2 requirement is clearly non-standard, it is unlikely + // to be fixed and it is very easy work around. We certainly + // don't want to go to great lengths to accommodate every buggy + // library we come across, but in this particular case, the effort + // is low and the benefit is high, so it's worth doing. Thanks to + // Yan Zhou for pointing this out to us. 
See similar code in + // ../MicroURNG.hpp + const static result_type _Min = 0; + const static result_type _Max = ~((result_type)0); + + static R123_CONSTEXPR result_type min R123_NO_MACRO_SUBST () { return _Min; } + static R123_CONSTEXPR result_type max R123_NO_MACRO_SUBST () { return _Max; } + + result_type operator()(){ + if( c.size() == 1 ) // short-circuit the scalar case. Compilers aren't mind-readers. + return b(c.incr(), key)[0]; + result_type& elem = v.back(); + if( elem == 0 ){ + v = b(c.incr(), key); + result_type ret = v.back(); + elem = c.size()-1; + return ret; + } + return v[--elem]; + } + + void discard(R123_ULONG_LONG skip){ + // don't forget: elem counts down + size_t nelem = c.size(); + size_t sub = skip % nelem; + result_type& elem = v.back(); + skip /= nelem; + if (elem < sub) { + elem += nelem; + skip++; + } + elem -= sub; + c.incr(skip); + fix_invariant(); + } + + //-------------------------- + // Some bonus methods, not required for a Random Number + // Engine + + // Constructors and seed() method for ukey_type seem useful + // We need const and non-const to supersede the SeedSeq template. + explicit Engine(const ukey_type &uk) : key(uk), c(){ v.back() = 0; } + explicit Engine(ukey_type &uk) : key(uk), c(){ v.back() = 0; } + void seed(const ukey_type& uk){ + *this = Engine(uk); + } + void seed(ukey_type& uk){ + *this = Engine(uk); + } + +#if R123_USE_CXX11_TYPE_TRAITS + template <typename DUMMY=void> + explicit Engine(const key_type& k, + typename std::enable_if<!std::is_same<ukey_type, key_type>::value, DUMMY>::type* = 0) + : key(k), c(){ v.back() = 0; } + + template <typename DUMMY=void> + void seed(const key_type& k, + typename std::enable_if<!std::is_same<ukey_type, key_type>::value, DUMMY>::type* = 0){ + *this = Engine(k); + } +#endif + + // Forward the e(counter) to the CBRNG we are templated + // on, using the current value of the key. 
+ ctr_type operator()(const ctr_type& c) const{ + return b(c, key); + } + + key_type getkey() const{ + return key; + } + + // N.B. setkey(k) is different from seed(k) because seed(k) zeros + // the counter (per the C++11 requirements for an Engine), whereas + // setkey does not. + void setkey(const key_type& k){ + key = k; + fix_invariant(); + } + + // Maybe the caller want's to know the details of + // the internal state, e.g., so it can call a different + // bijection with the same counter. + std::pair<ctr_type, result_type> getcounter() const { + return std::make_pair(c, v.back()); + } + + // And the inverse. + void setcounter(const ctr_type& _c, result_type _elem){ + static const size_t nelem = c.size(); + if( _elem >= nelem ) + throw std::range_error("Engine::setcounter called with elem out of range"); + c = _c; + v.back() = _elem; + fix_invariant(); + } + + void setcounter(const std::pair<ctr_type, result_type>& ce){ + setcounter(ce.first, ce.second); + } +}; +} // namespace r123 + +#endif diff --git a/ext/random123/include/Random123/conventional/gsl_cbrng.h b/ext/random123/include/Random123/conventional/gsl_cbrng.h new file mode 100644 index 0000000000000000000000000000000000000000..44457d002b0d40ec5319236f67bcd3ec2a3ce629 --- /dev/null +++ b/ext/random123/include/Random123/conventional/gsl_cbrng.h @@ -0,0 +1,128 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. 
Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __r123_compat_gslrng_dot_h__ +#define __r123_compat_gslrng_dot_h__ + +#include <gsl/gsl_rng.h> +#include <string.h> + +/** + The macro: GSL_CBRNG(NAME, CBRNGNAME) + declares the necessary structs and constants that define a + gsl_rng_NAME type based on the counter-based RNG CBRNGNAME. For example: + + Usage: + + @code + #include <Random123/threefry.h> + #include <Random123/conventional/gsl_cbrng.h> // this file + GSL_CBRNG(cbrng, threefry4x32); // creates gsl_rng_cbrng + + int main(int argc, char **argv){ + gsl_rng *r = gsl_rng_alloc(gsl_rng_cbrng); + ... use r as you would use any other gsl_rng ... + } + @endcode + + It requires that NAME be the name of a CBRNG that follows the + naming and stylistic conventions of the Random123 library. + + Note that wrapping a \ref CBRNG "counter-based PRNG" with a traditional API in + this way obscures much of the power of the CBRNG API. 
+ Nevertheless, it may be of value to applications that are already + coded to work with GSL random number generators, and that wish + to use the RNGs in the Random123 library. + + */ + +#define GSL_CBRNG(NAME, CBRNGNAME) \ +const gsl_rng_type *gsl_rng_##NAME; \ + \ +typedef struct{ \ + CBRNGNAME##_ctr_t ctr; \ + CBRNGNAME##_ctr_t r; \ + CBRNGNAME##_key_t key; \ + int elem; \ +} NAME##_state; \ + \ +static unsigned long int NAME##_get(void *vstate){ \ + NAME##_state *st = (NAME##_state *)vstate; \ + const int N=sizeof(st->ctr.v)/sizeof(st->ctr.v[0]); \ + if( st->elem == 0 ){ \ + ++st->ctr.v[0]; \ + if( N>1 && st->ctr.v[0] == 0 ) ++st->ctr.v[1]; \ + if( N>2 && st->ctr.v[1] == 0 ) ++st->ctr.v[2]; \ + if( N>3 && st->ctr.v[2] == 0 ) ++st->ctr.v[3]; \ + st->r = CBRNGNAME(st->ctr, st->key); \ + st->elem = N; \ + } \ + return 0xffffffffUL & st->r.v[--st->elem]; \ +} \ + \ +static double \ +NAME##_get_double (void * vstate) \ +{ \ + return NAME##_get (vstate)/4294967296.0; \ +} \ + \ +static void NAME##_set(void *vstate, unsigned long int s){ \ + NAME##_state *st = (NAME##_state *)vstate; \ + st->elem = 0; \ + /* Assume that key and ctr have an array member, v, \ + as if they are r123arrayNxW. If not, this will fail \ + to compile. In particular, this macro fails to compile \ + when the underlying CBRNG requires use of keyinit */ \ + memset(&st->ctr.v[0], 0, sizeof(st->ctr.v)); \ + memset(&st->key.v[0], 0, sizeof(st->key.v)); \ + /* GSL 1.15 documentation says this about gsl_rng_set: \ + Note that the most generators only accept 32-bit seeds, with higher \ + values being reduced modulo 2^32. For generators with smaller \ + ranges the maximum seed value will typically be lower. \ + so we won't jump through any hoops here to deal with \ + high bits if sizeof(unsigned long) > sizeof(uint32_t). 
*/ \ + st->key.v[0] = s; \ +} \ + \ +static const gsl_rng_type NAME##_type = { \ + #NAME, \ + 0xffffffffUL, \ + 0, \ + sizeof(NAME##_state), \ + &NAME##_set, \ + &NAME##_get, \ + &NAME##_get_double \ +}; \ + \ +const gsl_rng_type *gsl_rng_##NAME = &NAME##_type + +#endif + diff --git a/ext/random123/include/Random123/features/clangfeatures.h b/ext/random123/include/Random123/features/clangfeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..1e3c8cfdb9ccaf5e5c0491af373922e9855f1620 --- /dev/null +++ b/ext/random123/include/Random123/features/clangfeatures.h @@ -0,0 +1,93 @@ +/* +Copyright 2010-2016, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __clangfeatures_dot_hpp +#define __clangfeatures_dot_hpp + +#ifndef R123_USE_X86INTRIN_H +#if (defined(__x86_64__)||defined(__i386__)) +#define R123_USE_X86INTRIN_H 1 +#else +#define R123_USE_X86INTRIN_H 0 +#endif +#endif + +#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS +#define R123_USE_CXX11_UNRESTRICTED_UNIONS __has_feature(cxx_unrestricted_unions) +#endif + +#ifndef R123_USE_CXX11_STATIC_ASSERT +#define R123_USE_CXX11_STATIC_ASSERT __has_feature(cxx_static_assert) +#endif + +// With clang-3.6, -Wall warns about unused-local-typedefs. +// The "obvious" thing to do is to ignore -Wunused-local-typedefs, +// but that doesn't work because earlier versions of clang blow +// up on an 'unknown warning group'. So we briefly ignore -Wall... +// It's tempting to just give up on static assertions in pre-c++11 code. +#if !R123_USE_CXX11_STATIC_ASSERT && !defined(R123_STATIC_ASSERT) +#define R123_STATIC_ASSERT(expr, msg) \ +_Pragma("clang diagnostic push") \ +_Pragma("clang diagnostic ignored \"-Wall\"") \ +typedef char static_assertion[(!!(expr))*2-1] \ +_Pragma("clang diagnostic pop") +#endif + +#ifndef R123_USE_CXX11_CONSTEXPR +#define R123_USE_CXX11_CONSTEXPR __has_feature(cxx_constexpr) +#endif + +#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS +#define R123_USE_CXX11_EXPLICIT_CONVERSIONS __has_feature(cxx_explicit_conversions) +#endif + +// With clang-3.0, the apparently simpler: +// #define R123_USE_CXX11_RANDOM __has_include(<random>) +// dumps core. 
+#ifndef R123_USE_CXX11_RANDOM +#if __cplusplus>=201103L && __has_include(<random>) +#define R123_USE_CXX11_RANDOM 1 +#else +#define R123_USE_CXX11_RANDOM 0 +#endif +#endif + +#ifndef R123_USE_CXX11_TYPE_TRAITS +#if __cplusplus>=201103L && __has_include(<type_traits>) +#define R123_USE_CXX11_TYPE_TRAITS 1 +#else +#define R123_USE_CXX11_TYPE_TRAITS 0 +#endif +#endif + +#include "gccfeatures.h" + +#endif diff --git a/ext/random123/include/Random123/features/compilerfeatures.h b/ext/random123/include/Random123/features/compilerfeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..2341a7a01ef53fd6add4a381588fa6a4e84029b5 --- /dev/null +++ b/ext/random123/include/Random123/features/compilerfeatures.h @@ -0,0 +1,343 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/** + +@page porting Preprocessor symbols for porting Random123 to different platforms. + +The Random123 library is portable across C, C++, CUDA, OpenCL environments, +and multiple operating systems (Linux, Windows 7, Mac OS X, FreeBSD, Solaris). +This level of portability requires the abstraction of some features +and idioms that are either not standardized (e.g., asm statements), or for which +different vendors have their own standards (e.g., SSE intrinsics) or for +which vendors simply refuse to conform to well-established standards (e.g., <inttypes.h>). + +Random123/features/compilerfeatures.h +conditionally includes a compiler-or-OS-specific Random123/features/XXXfeatures.h file which +defines appropriate values for the preprocessor symbols which can be used with +a specific compiler or OS. Those symbols will then +be used by other header files and source files in the Random123 +library (and may be used by applications) to control what actually +gets presented to the compiler. + +Most of the symbols are boolean valued. In general, they will +\b always be defined with value either 1 or 0, so do +\b NOT use \#ifdef. Use \#if R123_USE_SOMETHING instead. + +Library users can override any value by defining the pp-symbol with a compiler option, +e.g., + + cc -DR123_USE_MULHILO64_C99 + +will use a strictly c99 version of the full-width 64x64->128-bit multiplication +function, even if it would be disabled by default.
+ +All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.h start with the prefix R123_USE_ +@verbatim + AES_NI + AES_OPENSSL + SSE4_2 + SSE4_1 + SSE + + STD_RANDOM + + GNU_UINT128 + ASM_GNU + ASM_MSASM + + CPUID_MSVC + + CXX11_RANDOM + CXX11_TYPE_TRAITS + CXX11_STATIC_ASSERT + CXX11_CONSTEXPR + CXX11_UNRESTRICTED_UNIONS + CXX11_EXPLICIT_CONVERSIONS + CXX11_LONG_LONG + CXX11_STD_ARRAY + CXX11 + + X86INTRIN_H + IA32INTRIN_H + XMMINTRIN_H + EMMINTRIN_H + SMMINTRIN_H + WMMINTRIN_H + INTRIN_H + + MULHILO32_ASM + MULHILO64_ASM + MULHILO64_MSVC_INTRIN + MULHILO64_CUDA_INTRIN + MULHILO64_OPENCL_INTRIN + MULHILO64_C99 + + U01_DOUBLE + +@endverbatim +Most have obvious meanings. Some non-obvious ones: + +AES_NI and AES_OPENSSL are not mutually exclusive. You can have one, +both or neither. + +GNU_UINT128 says that it's safe to use __uint128_t, but it +does not require its use. In particular, it should be +used in mulhilo<uint64_t> only if MULHILO64_ASM is unset. + +If the XXXINTRIN_H macros are true, then one should +@code +#include <xxxintrin.h> +@endcode +to gain access to compiler intrinsics. + +The CXX11_SOME_FEATURE macros allow the code to use specific +features of the C++11 language and library. +In the absence of a specific CXX11_SOME_FEATURE, the feature +is controlled by the catch-all R123_USE_CXX11 macro. + +U01_DOUBLE defaults on, and can be turned off (set to 0) +if one does not want the utility functions that convert to double +(i.e. u01_*_53()), e.g. on OpenCL without the cl_khr_fp64 extension. + +There are a number of invariants that are always true. Application code may +choose to rely on these: + +<ul> +<li>ASM_GNU and ASM_MSASM are mutually exclusive +<li>The "higher" SSE values imply the lower ones. +</ul> + +There are also non-boolean valued symbols: + +<ul> +<li>R123_STATIC_INLINE - + According to both C99 and GNU99, the 'static inline' declaration allows + the compiler to not emit code if the function is not used.
+ Note that the semantics of 'inline', 'static' and 'extern' in + gcc have changed over time and are subject to modification by + command line options, e.g., -std=gnu89, -fgnu-inline. + Nevertheless, it appears that the meaning of 'static inline' + has not changed over time and (with a little luck) the use of 'static inline' + here will be portable between versions of gcc and to other C99 + compilers. + See: http://gcc.gnu.org/onlinedocs/gcc/Inline.html + http://www.greenend.org.uk/rjk/2003/03/inline.html + +<li>R123_FORCE_INLINE(decl) - + which expands to 'decl', adorned with the compiler-specific + embellishments to strongly encourage that the declared function be + inlined. If there is no such compiler-specific magic, it should + expand to decl, unadorned. + +<li>R123_CUDA_DEVICE - which expands to __device__ (or something else with + sufficiently similar semantics) when CUDA is in use, and expands + to nothing in other cases. + +<li>R123_METAL_THREAD_ADDRESS_SPACE - which expands to 'thread' (or + something else with sufficiently similar semantics) when compiling a + Metal kernel, and expands to nothing in other cases. + +<li>R123_ASSERT(x) - which expands to assert(x), or maybe to nothing at + all if we're in an environment so feature-poor that you can't even + call assert (I'm looking at you, CUDA and OpenCL), or even include + assert.h safely (OpenCL). + +<li>R123_STATIC_ASSERT(expr,msg) - which expands to + static_assert(expr,msg), or to an expression that + will raise a compile-time exception if expr is not true. + +<li>R123_ULONG_LONG - which expands to a declaration of the longest available + unsigned integer. + +<li>R123_64BIT(x) - expands to something equivalent to + UINT64_C(x) from <stdint.h>, even in environments where <stdint.h> + is not available, e.g., MSVC and OpenCL. + +<li>R123_BUILTIN_EXPECT(expr,likely_value) - expands to something with + the semantics of gcc's __builtin_expect(expr,likely_value). 
If + the environment has nothing like __builtin_expect, it should expand + to just expr. +</ul> + + +\cond HIDDEN_FROM_DOXYGEN +*/ + +/* +N.B. When something is added to the list of features, it should be +added to each of the *features.h files, AND to examples/ut_features.cpp. +*/ + +/* N.B. most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters. */ +#if defined(__METAL_MACOS__) +#include "metalfeatures.h" +#elif defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0 +#include "openclfeatures.h" +#elif defined(__CUDACC__) +#include "nvccfeatures.h" +#elif defined(__ICC) +#include "iccfeatures.h" +#elif defined(__xlC__) +#include "xlcfeatures.h" +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +#include "sunprofeatures.h" +#elif defined(__OPEN64__) +#include "open64features.h" +#elif defined(__clang__) +#include "clangfeatures.h" +#elif defined(__GNUC__) +#include "gccfeatures.h" +#elif defined(__PGI) +#include "pgccfeatures.h" +#elif defined(_MSC_FULL_VER) +#include "msvcfeatures.h" +#else +#error "Can't identify compiler. 
You'll need to add a new xxfeatures.hpp" +{ /* maybe an unbalanced brace will terminate the compilation */ +#endif + +#ifndef R123_USE_CXX11 +#define R123_USE_CXX11 (__cplusplus >= 201103L) +#endif + +#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS +#define R123_USE_CXX11_UNRESTRICTED_UNIONS R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_STATIC_ASSERT +#define R123_USE_CXX11_STATIC_ASSERT R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_CONSTEXPR +#define R123_USE_CXX11_CONSTEXPR R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS +#define R123_USE_CXX11_EXPLICIT_CONVERSIONS R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_RANDOM +#define R123_USE_CXX11_RANDOM R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_TYPE_TRAITS +#define R123_USE_CXX11_TYPE_TRAITS R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_LONG_LONG +#define R123_USE_CXX11_LONG_LONG R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_STD_ARRAY +#define R123_USE_CXX11_STD_ARRAY R123_USE_CXX11 +#endif + +#ifndef R123_USE_MULHILO64_C99 +#define R123_USE_MULHILO64_C99 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#define R123_USE_MULHILO64_MULHI_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 0 +#endif + +#ifndef R123_STATIC_ASSERT +#if R123_USE_CXX11_STATIC_ASSERT +#define R123_STATIC_ASSERT(expr, msg) static_assert(expr, msg) +#else + /* if msg always_looked_like_this, we could paste it into the name. Worth it? 
*/ +#define R123_STATIC_ASSERT(expr, msg) typedef char static_assertion[(!!(expr))*2-1] +#endif +#endif + +#ifndef R123_CONSTEXPR +#if R123_USE_CXX11_CONSTEXPR +#define R123_CONSTEXPR constexpr +#else +#define R123_CONSTEXPR +#endif +#endif + +#ifndef R123_USE_64BIT +#define R123_USE_64BIT 1 +#endif + +#ifndef R123_USE_PHILOX_64BIT +#define R123_USE_PHILOX_64BIT (R123_USE_64BIT && (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || R123_USE_MULHILO64_MULHI_INTRIN)) +#endif + +#ifndef R123_ULONG_LONG +#if defined(__cplusplus) && !R123_USE_CXX11_LONG_LONG +/* C++98 doesn't have long long. It doesn't have uint64_t either, but + we will have typedef'ed uint64_t to something in the xxxfeatures.h. + With luck, it won't elicit complaints from -pedantic. Cross your + fingers... */ +#define R123_ULONG_LONG uint64_t +#else +#define R123_ULONG_LONG unsigned long long +#endif +#endif + +/* UINT64_C should have been #defined by XXXfeatures.h, either by + #include <stdint.h> or through compiler-dependent hacks */ +#ifndef R123_64BIT +#define R123_64BIT(x) UINT64_C(x) +#endif + +#ifndef R123_THROW +#define R123_THROW(x) throw (x) +#endif + +#ifndef R123_METAL_THREAD_ADDRESS_SPACE +#define R123_METAL_THREAD_ADDRESS_SPACE +#endif + +#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE +#define R123_METAL_CONSTANT_ADDRESS_SPACE +#endif + +/* + * Windows.h (and perhaps other "well-meaning" code) defines min and + * max, so there's a high chance that our definition of min, max + * methods or use of std::numeric_limits min and max will cause + * complaints in any program that happened to include Windows.h or + * suchlike first. We use the null macro below in our own header + * files definition or use of min, max to defensively preclude + * this problem. It may not be enough; one might need to #define + * NOMINMAX before including Windows.h or compile with -DNOMINMAX. 
+ */ +#define R123_NO_MACRO_SUBST + +/** \endcond */ diff --git a/ext/random123/include/Random123/features/gccfeatures.h b/ext/random123/include/Random123/features/gccfeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..701f3c667801421d7638d6cf7474f5bc461f8411 --- /dev/null +++ b/ext/random123/include/Random123/features/gccfeatures.h @@ -0,0 +1,263 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef __gccfeatures_dot_hpp +#define __gccfeatures_dot_hpp + +#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) + +#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) && !defined(__arm__) && !defined(__aarch64__) +# error "This code has only been tested on x86, powerpc and a few arm platforms." +#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task> +{ /* maybe an unbalanced brace will terminate the compilation */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ +#endif + +#ifdef __powerpc__ +#include <ppu_intrinsics.h> +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static __inline__ +#endif + +#ifndef R123_FORCE_INLINE +#if R123_GNUC_VERSION >= 40000 +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#else +#define R123_FORCE_INLINE(decl) decl +#endif +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include <assert.h> +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#endif + +/* According to the C++0x standard, we should be able to test the numeric + value of __cplusplus == 199701L for C++98, __cplusplus == 201103L for C++11 + But gcc has had an open bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=1773 + since early 2001, which was finally fixed in 4.7 (early 2012). For + earlier versions, the only way to detect whether --std=c++0x was requested + on the command line is to look at the __GCC_EXPERIMENTAL_CXX0X__ pp-symbol. 
+*/ +#if defined(__GCC_EXPERIMENTAL_CXX0X__) +#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && 1/* defined(__GCC_EXPERIMENTAL_CXX0X__) */)) +#else +#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && 0/* defined(__GCC_EXPERIMENTAL_CXX0X__) */)) +#endif + +#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS +#define R123_USE_CXX11_UNRESTRICTED_UNIONS ((R123_GNUC_VERSION >= 40600) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_STATIC_ASSERT +#define R123_USE_CXX11_STATIC_ASSERT ((R123_GNUC_VERSION >= 40300) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_CONSTEXPR +#define R123_USE_CXX11_CONSTEXPR ((R123_GNUC_VERSION >= 40600) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS +#define R123_USE_CXX11_EXPLICIT_CONVERSIONS ((R123_GNUC_VERSION >= 40500) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_RANDOM +#define R123_USE_CXX11_RANDOM ((R123_GNUC_VERSION>=40500) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_TYPE_TRAITS +#define R123_USE_CXX11_TYPE_TRAITS ((R123_GNUC_VERSION>=40400) && GNU_CXX11) +#endif + +#ifndef R123_USE_AES_NI +#ifdef __AES__ +#define R123_USE_AES_NI 1 +#else +#define R123_USE_AES_NI 0 +#endif +#endif + +#ifndef R123_USE_SSE4_2 +#ifdef __SSE4_2__ +#define R123_USE_SSE4_2 1 +#else +#define R123_USE_SSE4_2 0 +#endif +#endif + +#ifndef R123_USE_SSE4_1 +#ifdef __SSE4_1__ +#define R123_USE_SSE4_1 1 +#else +#define R123_USE_SSE4_1 0 +#endif +#endif + +#ifndef R123_USE_SSE +/* There's no point in trying to compile SSE code in Random123 + unless SSE2 is available. */ +#ifdef __SSE2__ +#define R123_USE_SSE 1 +#else +#define R123_USE_SSE 0 +#endif +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. 
Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#if defined(__x86_64__) || defined(__aarch64__) +#define R123_USE_GNU_UINT128 1 +#else +#define R123_USE_GNU_UINT128 0 +#endif +#endif + +#ifndef R123_USE_ASM_GNU +#if (defined(__x86_64__)||defined(__i386__)) +#define R123_USE_ASM_GNU 1 +#else +#define R123_USE_ASM_GNU 0 /* not x86: off (was 1, identical to the x86 arm, which made this #if a no-op) */ +#endif +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#if (defined(__x86_64__)||defined(__i386__)) +#define R123_USE_X86INTRIN_H (1/* (defined(__x86_64__)||defined(__i386__)) */ && R123_GNUC_VERSION >= 40402) +#else +#define R123_USE_X86INTRIN_H (0/* (defined(__x86_64__)||defined(__i386__)) */ && R123_GNUC_VERSION >= 40402) +#endif +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +/* gcc -m64 on Solaris 10 defines __SSE2__ but doesn't have + emmintrin.h in the include search path. This is + so broken that I refuse to try to work around it. If this + affects you, figure out where your emmintrin.h lives and + add an appropriate -I to your CPPFLAGS. Or add -DR123_USE_SSE=0. 
*/ +#define R123_USE_EMMINTRIN_H (R123_USE_SSE && (R123_GNUC_VERSION < 40402)) +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H ((R123_USE_SSE4_1 || R123_USE_SSE4_2) && (R123_GNUC_VERSION < 40402)) +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#if (defined(__powerpc64__)) +#define R123_USE_MULHILO64_MULHI_INTRIN 1 +#else +#define R123_USE_MULHILO64_MULHI_INTRIN 0 +#endif +#endif + +#ifndef R123_MULHILO64_MULHI_INTRIN +#define R123_MULHILO64_MULHI_INTRIN __mulhdu +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 0 +#endif + +#ifndef R123_MULHILO32_MULHI_INTRIN +#define R123_MULHILO32_MULHI_INTRIN __mulhwu +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include <stdint.h> +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h> +#endif + +/* If you add something, it must go in all the other XXfeatures.hpp + and in ../ut_features.cpp */ +#endif diff --git a/ext/random123/include/Random123/features/iccfeatures.h b/ext/random123/include/Random123/features/iccfeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..7e72dec1d31027eac66e00fe92b4cd926b235d23 --- /dev/null +++ b/ext/random123/include/Random123/features/iccfeatures.h @@ -0,0 +1,212 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __icpcfeatures_dot_hpp +#define __icpcfeatures_dot_hpp + +// icc relies on gcc libraries and other toolchain components. +#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) + +#if !defined(__x86_64__) && !defined(__i386__) +# error "This code has only been tested on x86 platforms." 
+{ // maybe an unbalanced brace will terminate the compilation +// You are invited to try Random123 on other architectures, by changing +// the conditions that reach this error, but you should consider it a +// porting exercise and expect to encounter bugs and deficiencies. +// Please let the authors know of any successes (or failures). +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include <assert.h> +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#endif + +// The basic idiom is: +// #ifndef R123_SOMETHING +// #if some condition +// #define R123_SOMETHING 1 +// #else +// #define R123_SOMETHING 0 +// #endif +// #endif +// This idiom allows an external user to override any decision +// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0 + +// An alternative idiom is: +// #ifndef R123_SOMETHING +// #define R123_SOMETHING (some boolean expression) +// #endif +// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE +// pp-symbols. + +#ifndef R123_USE_SSE4_2 +#ifdef __SSE4_2__ +#define R123_USE_SSE4_2 1 +#else +#define R123_USE_SSE4_2 0 +#endif +#endif + +#ifndef R123_USE_SSE4_1 +#ifdef __SSE4_1__ +#define R123_USE_SSE4_1 1 +#else +#define R123_USE_SSE4_1 0 +#endif +#endif + +#ifndef R123_USE_SSE +#ifdef __SSE2__ +#define R123_USE_SSE 1 +#else +#define R123_USE_SSE 0 +#endif +#endif + +#ifndef R123_USE_AES_NI +// Unlike gcc, icc (version 12) does not pre-define an __AES__ +// pp-symbol when -maes or -xHost is on the command line. 
This feels +// like a defect in icc (it defines __SSE4_2__ in analogous +// circumstances), but until Intel fixes it, we're better off erring +// on the side of caution and not generating instructions that are +// going to raise SIGILL when executed. To get the AES-NI +// instructions with icc, the caller must put something like +// -DR123_USE_AES_NI=1 or -D__AES__ on the command line. FWIW, the +// AES-NI Whitepaper by Gueron says that icc has supported AES-NI from +// 11.1 onwards. +// +#if defined(__AES__) +#define R123_USE_AES_NI ((__ICC>=1101) && 1/*defined(__AES__)*/) +#else +#define R123_USE_AES_NI ((__ICC>=1101) && 0/*defined(__AES__)*/) +#endif +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 1 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 1 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 1 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 1 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 1 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 0 +#endif + +#ifndef R123_USE_MULHILO16_ASM +#define R123_USE_MULHILO16_ASM 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#ifndef 
R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include <stdint.h> +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h> +#endif + +// If you add something, it must go in all the other XXfeatures.hpp +// and in ../ut_features.cpp +#endif diff --git a/ext/random123/include/Random123/features/metalfeatures.h b/ext/random123/include/Random123/features/metalfeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..bafe51a6840cb3ae5ca7fc1145cf5442db0657e5 --- /dev/null +++ b/ext/random123/include/Random123/features/metalfeatures.h @@ -0,0 +1,111 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * Written by Tom Schoonjans <Tom.Schoonjans@me.com> + */ + +#ifndef __metalfeatures_dot_hpp +#define __metalfeatures_dot_hpp + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_METAL_THREAD_ADDRESS_SPACE +#define R123_METAL_THREAD_ADDRESS_SPACE thread +#endif + +#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE +#define R123_METAL_CONSTANT_ADDRESS_SPACE constant +#endif + +#ifndef R123_ASSERT +#define R123_ASSERT(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 1 +#endif + +#if R123_USE_MULHILO32_MULHI_INTRIN +#include <metal_integer> +#define R123_MULHILO32_MULHI_INTRIN metal::mulhi +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_64BIT +#define 
R123_USE_64BIT 0 /* Metal currently (Feb 2019, Specification-2) does not support 64-bit variable types */ +#endif + +#ifndef R123_ULONG_LONG +/* the longest integer type in Metal (Feb 2019, Specification-2) is a + * 32-bit unsigned int. Let's hope for the best... */ +#define R123_ULONG_LONG unsigned int +#endif + +#endif diff --git a/ext/random123/include/Random123/features/msvcfeatures.h b/ext/random123/include/Random123/features/msvcfeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..9eb9520912daf66869a6cf9fd027c37f06a8a3d4 --- /dev/null +++ b/ext/random123/include/Random123/features/msvcfeatures.h @@ -0,0 +1,200 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __msvcfeatures_dot_hpp +#define __msvcfeatures_dot_hpp + +//#if _MSVC_FULL_VER <= 15 +//#error "We've only tested MSVC_FULL_VER==15." +//#endif + +#if !defined(_M_IX86) && !defined(_M_X64) +# error "This code has only been tested on x86 platforms." +{ // maybe an unbalanced brace will terminate the compilation +// You are invited to try Random123 on other architectures, by changing +// the conditions that reach this error, but you should consider it a +// porting exercise and expect to encounter bugs and deficiencies. +// Please let the authors know of any successes (or failures). 
+#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static __inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) _forceinline decl +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include <assert.h> +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +// The basic idiom is: +// #ifndef R123_SOMETHING +// #if some condition +// #define R123_SOMETHING 1 +// #else +// #define R123_SOMETHING 0 +// #endif +// #endif +// This idiom allows an external user to override any decision +// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0 + +// An alternative idiom is: +// #ifndef R123_SOMETHING +// #define R123_SOMETHING (some boolean expression) +// #endif +// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE +// pp-symbols. + +#ifndef R123_USE_AES_NI +#if defined(_M_X64) +#define R123_USE_AES_NI 1 +#else +#define R123_USE_AES_NI 0 +#endif +#endif + +#ifndef R123_USE_SSE4_2 +#if defined(_M_X64) +#define R123_USE_SSE4_2 1 +#else +#define R123_USE_SSE4_2 0 +#endif +#endif + +#ifndef R123_USE_SSE4_1 +#if defined(_M_X64) +#define R123_USE_SSE4_1 1 +#else +#define R123_USE_SSE4_1 0 +#endif +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 1 +#endif + +#ifndef R123_USE_AES_OPENSSL +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 0 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 1 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 1 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define 
R123_USE_SMMINTRIN_H 1 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 1 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 1 +#endif + +#ifndef R123_USE_MULHILO16_ASM +#define R123_USE_MULHILO16_ASM 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#if defined(_M_X64) +#define R123_USE_MULHILO64_MSVC_INTRIN 1 +#else +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include <stdint.h> +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h> +#endif + +#pragma warning(disable:4244) +#pragma warning(disable:4996) + +// If you add something, it must go in all the other XXfeatures.hpp +// and in ../ut_features.cpp +#endif diff --git a/ext/random123/include/Random123/features/nvccfeatures.h b/ext/random123/include/Random123/features/nvccfeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..d1ff8bf521a05d45232e922c9a875439bc84f837 --- /dev/null +++ b/ext/random123/include/Random123/features/nvccfeatures.h @@ -0,0 +1,125 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __r123_nvcc_features_dot_h__ +#define __r123_nvcc_features_dot_h__ + +#if !defined(CUDART_VERSION) +#error "why are we in nvccfeatures.h if CUDART_VERSION is not defined" +#endif + +#if CUDART_VERSION < 4010 +#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 isunsupported. See comments in nvccfeatures.h" +// This test was added in Random123-1.08 (August, 2013) because we +// discovered that Ftype(maxTvalue<T>()) with Ftype=double and +// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and +// earlier. We can't be sure this bug doesn't also affect invocations +// of other templated functions, e.g., essentially all of Random123. 
+// Thus, we no longer trust CUDA versions earlier than 4.1 even though +// we had previously tested and timed Random123 with CUDA 3.x and 4.0. +// If you feel lucky or desperate, you can change #error to #warning, but +// please take extra care to be sure that you are getting correct +// results. +#endif + +// nvcc falls through to gcc or msvc. So first define +// a couple of things and then include either gccfeatures.h +// or msvcfeatures.h + +//#ifdef __CUDA_ARCH__ allows Philox32 and Philox64 to be compiled +//for both device and host functions in CUDA by setting compiler flags +//for the device function +#ifdef __CUDA_ARCH__ +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE __device__ +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 1 +#endif + +#ifndef R123_THROW +// No exceptions in CUDA, at least up to 4.0 +#define R123_THROW(x) R123_ASSERT(0) +#endif + +#ifndef R123_ASSERT +#define R123_ASSERT(x) if((x)) ; else asm("trap;") +#endif + +#else // ! __CUDA_ARCH__ +// If we're using nvcc not compiling for the CUDA architecture, +// then we must be compiling for the host. In that case, +// tell the philox code to use the mulhilo64 asm because +// nvcc doesn't grok uint128_t. 
+#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#endif // __CUDA_ARCH__ + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_ULONG_LONG +// uint64_t, which is what we'd get without this, is +// not the same as unsigned long long +#define R123_ULONG_LONG unsigned long long +#endif + +#if defined(__GNUC__) +#include "gccfeatures.h" +#elif defined(_MSC_FULL_VER) +#include "msvcfeatures.h" +#endif + +#endif diff --git a/ext/random123/include/Random123/features/open64features.h b/ext/random123/include/Random123/features/open64features.h new file mode 100644 index 0000000000000000000000000000000000000000..8da9f5f51efab021c644b632b4499f12fa0220d9 --- /dev/null +++ b/ext/random123/include/Random123/features/open64features.h @@ -0,0 +1,50 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __open64features_dot_hpp +#define __open64features_dot_hpp + +/* The gcc features are mostly right. We just override a few and then include gccfeatures.h */ + +/* Open64 4.2.3 and 4.2.4 accept the __uint128_t code without complaint + but produce incorrect code for 64-bit philox. The MULHILO64_ASM + seems to work fine */ +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#include "gccfeatures.h" + +#endif diff --git a/ext/random123/include/Random123/features/openclfeatures.h b/ext/random123/include/Random123/features/openclfeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..af03d3092318c6c27f1a65ce8104c1609b1e66e1 --- /dev/null +++ b/ext/random123/include/Random123/features/openclfeatures.h @@ -0,0 +1,89 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef __openclfeatures_dot_hpp +#define __openclfeatures_dot_hpp + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#define R123_ASSERT(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 1 +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of +// ulong to mul_hi. And gets lots of complaints from stdint.h +// on some machines. +// But these typedefs mean we cannot include stdint.h with +// these headers? Do we need R123_64T, R123_32T, R123_8T? +typedef ulong uint64_t; +typedef uint uint32_t; +typedef uchar uint8_t; +#define UINT64_C(x) ((ulong)(x##UL)) + +#endif diff --git a/ext/random123/include/Random123/features/pgccfeatures.h b/ext/random123/include/Random123/features/pgccfeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..18ace1353b4e0e6201c823e17b5325c2a9b05afe --- /dev/null +++ b/ext/random123/include/Random123/features/pgccfeatures.h @@ -0,0 +1,194 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Copyright (c) 2013, Los Alamos National Security, LLC +All rights reserved. + +Copyright 2013. Los Alamos National Security, LLC. This software was produced +under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National +Laboratory (LANL), which is operated by Los Alamos National Security, LLC for +the U.S. Department of Energy. The U.S. Government has rights to use, +reproduce, and distribute this software. 
NEITHER THE GOVERNMENT NOR LOS +ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR +ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified +to produce derivative works, such modified software should be clearly marked, +so as not to confuse it with the version available from LANL. +*/ +#ifndef __pgccfeatures_dot_hpp +#define __pgccfeatures_dot_hpp + +#if !defined(__x86_64__) && !defined(__i386__) +# error "This code has only been tested on x86 platforms." +#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task> +{ /* maybe an unbalanced brace will terminate the compilation */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +/* Found this example in PGI's emmintrin.h. */ +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include <assert.h> +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) (expr) +#endif + +/* PGI through 13.2 doesn't appear to support AES-NI. */ +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +/* PGI through 13.2 appears to support MMX, SSE, SSE3, SSE3, SSSE3, SSE4a, and + ABM, but not SSE4.1 or SSE4.2. */ +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +/* There's no point in trying to compile SSE code in Random123 + unless SSE2 is available. 
*/ +#ifdef __SSE2__ +#define R123_USE_SSE 1 +#else +#define R123_USE_SSE 0 +#endif +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 1 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +/* emmintrin.h from PGI #includes xmmintrin.h but then complains at link time + about undefined references to _mm_castsi128_ps(__m128i). Why? */ +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 1 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 1 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 0 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#ifdef __ABM__ +#define R123_USE_INTRIN_H 1 +#else +#define R123_USE_INTRIN_H 0 +#endif +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#define R123_USE_MULHILO64_MULHI_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include <stdint.h> +#ifndef 
UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h> +#endif + +/* If you add something, it must go in all the other XXfeatures.hpp + and in ../ut_features.cpp */ +#endif diff --git a/ext/random123/include/Random123/features/sse.h b/ext/random123/include/Random123/features/sse.h new file mode 100644 index 0000000000000000000000000000000000000000..3a49ebd8652e0d8deb50ca0daac50f56818ca6e5 --- /dev/null +++ b/ext/random123/include/Random123/features/sse.h @@ -0,0 +1,280 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _Random123_sse_dot_h__ +#define _Random123_sse_dot_h__ + +#if R123_USE_SSE + +#if R123_USE_X86INTRIN_H +#include <x86intrin.h> +#endif +#if R123_USE_IA32INTRIN_H +#include <ia32intrin.h> +#endif +#if R123_USE_XMMINTRIN_H +#include <xmmintrin.h> +#endif +#if R123_USE_EMMINTRIN_H +#include <emmintrin.h> +#endif +#if R123_USE_SMMINTRIN_H +#include <smmintrin.h> +#endif +#if R123_USE_WMMINTRIN_H +#include <wmmintrin.h> +#endif +#if R123_USE_INTRIN_H +#include <intrin.h> +#endif +#ifdef __cplusplus +#include <iostream> +#include <limits> +#include <stdexcept> +#endif + +#if R123_USE_ASM_GNU + +/* bit25 of CX tells us whether AES is enabled. */ +R123_STATIC_INLINE int haveAESNI(){ + unsigned int eax, ebx, ecx, edx; + __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : + "a" (1)); + return (ecx>>25) & 1; +} +#elif R123_USE_CPUID_MSVC +R123_STATIC_INLINE int haveAESNI(){ + int CPUInfo[4]; + __cpuid(CPUInfo, 1); + return (CPUInfo[2]>>25)&1; +} +#else /* R123_USE_CPUID_??? */ +#warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false" +R123_STATIC_INLINE int haveAESNI(){ + return 0; +} +#endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */ + +// There is a lot of annoying and inexplicable variation in the +// SSE intrinsics available in different compilation environments. +// The details seem to depend on the compiler, the version and +// the target architecture. 
Rather than insisting on +// R123_USE_feature tests for each of these in each of the +// compilerfeatures.h files we just keep the complexity localized +// to here... +#if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64)) +/* Is there an intrinsic to assemble an __m128i from two 64-bit words? + If not, use the 4x32-bit intrinsic instead. N.B. It looks like Intel + added _mm_set_epi64x to icc version 12.1 in Jan 2012. +*/ +R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){ + union{ + uint64_t u64; + uint32_t u32[2]; + } u1, u0; + u1.u64 = v1; + u0.u64 = v0; + return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]); +} +#endif +/* _mm_extract_lo64 abstracts the task of extracting the low 64-bit + word from an __m128i. The _mm_cvtsi128_si64 intrinsic does the job + on 64-bit platforms. Unfortunately, both MSVC and Open64 fail + assertions in ut_M128.cpp and ut_carray.cpp when we use the + _mm_cvtsi128_si64 intrinsic. (See + https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug). + On 32-bit platforms, there's no MOVQ, so there's no intrinsic. + Finally, even if the intrinsic exists, it may be spelled with or + without the 'x'. +*/ +#if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__) +R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){ + union{ + uint64_t u64[2]; + __m128i m; + }u; + _mm_store_si128(&u.m, si); + return u.u64[0]; +} +#elif defined(__llvm__) || defined(__ICC) +R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){ + return (uint64_t)_mm_cvtsi128_si64(si); +} +#else /* GNUC, others */ +/* FWIW, gcc's emmintrin.h has had the 'x' spelling + since at least gcc-3.4.4. The no-'x' spelling showed up + around 4.2. */ +R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){ + return (uint64_t)_mm_cvtsi128_si64x(si); +} +#endif +#if defined(__GNUC__) && __GNUC__ < 4 +/* the cast builtins showed up in gcc4. 
*/ +R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){ + return (__m128)si; +} +#endif + +#ifdef __cplusplus + +struct r123m128i{ + __m128i m; +#if R123_USE_CXX11_UNRESTRICTED_UNIONS + // C++98 forbids a union member from having *any* constructors. + // C++11 relaxes this, and allows union members to have constructors + // as long as there is a "trivial" default constructor. So in C++11 + // we can provide a r123m128i constructor with an __m128i argument, and still + // have the default (and hence trivial) default constructor. + r123m128i() = default; + r123m128i(__m128i _m): m(_m){} +#endif + r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;} + r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;} +#if R123_USE_CXX11_EXPLICIT_CONVERSIONS + // With C++11 we can attach explicit to the bool conversion operator + // to disambiguate undesired promotions. For g++, this works + // only in 4.5 and above. + explicit operator bool() const {return _bool();} +#else + // Pre-C++11, we have to do something else. Google for the "safe bool" + // idiom for other ideas... 
+ operator const void*() const{return _bool()?this:0;} +#endif + operator __m128i() const {return m;} + +private: +#if R123_USE_SSE4_1 + bool _bool() const{ return !_mm_testz_si128(m,m); } +#else + bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); } +#endif +}; + +R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){ + __m128i& c = v.m; + __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1)); + c = _mm_add_epi64(c, zeroone); + //return c; +#if R123_USE_SSE4_1 + __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0))); + if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){ + __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0)); + c = _mm_add_epi64(c, onezero); + } +#else + unsigned mask = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128()))); + // The low two bits of mask are 11 iff the low 64 bits of + // c are zero. + if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){ + __m128i onezero = _mm_set_epi64x(1,0); + c = _mm_add_epi64(c, onezero); + } +#endif + return v; +} + +R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){ + __m128i c = lhs.m; + __m128i incr128 = _mm_set_epi64x(0, n); + c = _mm_add_epi64(c, incr128); + // return c; // NO CARRY! + + int64_t lo64 = _mm_extract_lo64(c); + if((uint64_t)lo64 < n) + c = _mm_add_epi64(c, _mm_set_epi64x(1,0)); + lhs.m = c; + return lhs; +} + +// We need this one because it's present, but never used in r123array1xm128i::incr +R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){ + throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");} + +// The comparisons aren't implemented, but if we leave them out, and +// somebody writes, e.g., M1 < M2, the compiler will do an implicit +// conversion through void*. Sigh... 
+R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){ + throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");} +R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){ + throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");} +R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){ + throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");} +R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){ + throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");} + +R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){ + return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); } +R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){ + return !(lhs==rhs);} +R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){ + r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; } +R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){ + return !(lhs==rhs);} +R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){ + union{ + uint64_t u64[2]; + __m128i m; + }u; + _mm_storeu_si128(&u.m, m.m); + return os << u.u64[0] << " " << u.u64[1]; +} + +R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){ + uint64_t u64[2]; + is >> u64[0] >> u64[1]; + m.m = _mm_set_epi64x(u64[1], u64[0]); + return is; +} + +template<typename T> inline T assemble_from_u32(uint32_t *p32); // forward declaration + +template <> +inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){ + r123m128i ret; + ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]); + return ret; +} + +#else + +typedef struct { + __m128i m; +} r123m128i; + +#endif /* __cplusplus */ + +#else /* !R123_USE_SSE */ +R123_STATIC_INLINE int haveAESNI(){ + return 0; +} +#endif /* R123_USE_SSE */ + +#endif /* 
_Random123_sse_dot_h__ */ diff --git a/ext/random123/include/Random123/features/sunprofeatures.h b/ext/random123/include/Random123/features/sunprofeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..c9cdc00f5e8f970898ae577b14fa910ceb135a91 --- /dev/null +++ b/ext/random123/include/Random123/features/sunprofeatures.h @@ -0,0 +1,172 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef __sunprofeatures_dot_hpp +#define __sunprofeatures_dot_hpp + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include <assert.h> +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +// The basic idiom is: +// #ifndef R123_SOMETHING +// #if some condition +// #define R123_SOMETHING 1 +// #else +// #define R123_SOMETHING 0 +// #endif +// #endif +// This idiom allows an external user to override any decision +// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0 + +// An alternative idiom is: +// #ifndef R123_SOMETHING +// #define R123_SOMETHING (some boolean expression) +// #endif +// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE +// pp-symbols. + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 0 +#endif + +#ifndef R123_USE_AES_OPENSSL +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 0 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 0 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 0 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#define 
R123_USE_INTRIN_H 0 +#endif + +#ifndef R123_USE_MULHILO16_ASM +#define R123_USE_MULHILO16_ASM 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef R123_USE_PHILOX_64BIT +#define R123_USE_PHILOX_64BIT 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include <stdint.h> +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h> +#endif + +// If you add something, it must go in all the other XXfeatures.hpp +// and in ../ut_features.cpp +#endif diff --git a/ext/random123/include/Random123/features/xlcfeatures.h b/ext/random123/include/Random123/features/xlcfeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..ccb98ee5531c57253410eb4d1fe3692ec289bbca --- /dev/null +++ b/ext/random123/include/Random123/features/xlcfeatures.h @@ -0,0 +1,210 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. 
Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Copyright (c) 2013, Los Alamos National Security, LLC +All rights reserved. + +Copyright 2013. Los Alamos National Security, LLC. This software was produced +under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National +Laboratory (LANL), which is operated by Los Alamos National Security, LLC for +the U.S. Department of Energy. The U.S. Government has rights to use, +reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS +ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR +ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified +to produce derivative works, such modified software should be clearly marked, +so as not to confuse it with the version available from LANL. +*/ +#ifndef __xlcfeatures_dot_hpp +#define __xlcfeatures_dot_hpp + +#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) +# error "This code has only been tested on x86 and PowerPC platforms." 
+#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task> +{ /* maybe an unbalanced brace will terminate the compilation */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ +#endif + +#ifdef __cplusplus +/* builtins are automatically available to xlc. To use them with xlc++, + one must include builtins.h. c.f + http://publib.boulder.ibm.com/infocenter/cellcomp/v101v121/index.jsp?topic=/com.ibm.xlcpp101.cell.doc/compiler_ref/compiler_builtins.html +*/ +#include <builtins.h> +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include <assert.h> +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 0 +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. 
Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 1 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 0 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 0 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#ifdef __ABM__ +#define R123_USE_INTRIN_H 1 +#else +#define R123_USE_INTRIN_H 0 +#endif +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#if (defined(__powerpc64__)) +#define R123_USE_MULHILO64_MULHI_INTRIN 1 +#else +#define R123_USE_MULHILO64_MULHI_INTRIN 0 +#endif +#endif + +#ifndef R123_MULHILO64_MULHI_INTRIN +#define R123_MULHILO64_MULHI_INTRIN __mulhdu +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 0 +#endif + +#ifndef R123_MULHILO32_MULHI_INTRIN +#define R123_MULHILO32_MULHI_INTRIN __mulhwu +#endif + +#ifndef R123_USE_MULHILO64_ASM +#if defined(__powerpc64__) +#define R123_USE_MULHILO64_ASM (1 /*defined(__powerpc64__)*/ && !(R123_USE_MULHILO64_MULHI_INTRIN)) +#else +#define R123_USE_MULHILO64_ASM (0 /*defined(__powerpc64__)*/ && !(R123_USE_MULHILO64_MULHI_INTRIN)) +#endif +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 
+#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include <stdint.h> +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h> +#endif + +/* If you add something, it must go in all the other XXfeatures.hpp + and in ../ut_features.cpp */ +#endif diff --git a/ext/random123/include/Random123/gsl_microrng.h b/ext/random123/include/Random123/gsl_microrng.h new file mode 100644 index 0000000000000000000000000000000000000000..4f09412152687462506ce88650a5328f6787ae23 --- /dev/null +++ b/ext/random123/include/Random123/gsl_microrng.h @@ -0,0 +1,136 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __r123_gslmicrorng_dot_h__ +#define __r123_gslmicrorng_dot_h__ + + +#include <gsl/gsl_rng.h> +#include <string.h> + +/** The macro: GSL_MICRORNG(NAME, CBRNGNAME) is the GSL + analog of the C++ r123::MicroURNG template. It declares a gsl_rng + type named gsl_rng_NAME which uses the underlying CBRNGNAME + and can be invoked a limited number of times between calls to NAME_reset. + + When the underlying CBRNG's \c ctr_t is an \ref arrayNxW "r123arrayNxW", + the gsl_rng_NAME may be called up to \c N*2^32 times + between calls to \c NAME_reset. + + \c NAME_reset takes a gsl_rng_NAME type, a counter and a key as arguments. + It restarts the micro-rng with a new base counter and key. + + Note that you must call NAME_reset before the first use + of a gsl_rng. NAME_reset is not called automatically by + gsl_rng_alloc(). + + @code + #include <Random123/threefry.h> + #include <Random123/gsl_microrng.h> // this file + GSL_MICRORNG(microcbrng, threefry4x64) // creates gsl_rng_microcbrng + + int main(int argc, char** argv) { + gsl_rng *r = gsl_rng_alloc(gsl_rng_microcbrng); + threefry4x64_ctr_t c = {{}}; + threefry4x64_key_t k = {{}}; + + for (...) { + c.v[0] = ??; // some application variable + microcbrng_reset(r, c, k); + for (...) { + // gaussian calls r several times.
It is safe for + // r to be used up to N*2^32 times in this loop + something[i] = gsl_ran_gaussian(r, 1.5); + } + } + } + @endcode + +*/ + +#define GSL_MICRORNG(NAME, CBRNGNAME) \ +const gsl_rng_type *gsl_rng_##NAME; \ + \ +typedef struct{ \ + CBRNGNAME##_ctr_t ctr; \ + CBRNGNAME##_ctr_t r; \ + CBRNGNAME##_key_t key; \ + R123_ULONG_LONG n; \ + int elem; \ +} NAME##_state; \ + \ +static unsigned long int NAME##_get(void *vstate){ \ + NAME##_state *st = (NAME##_state *)vstate; \ + const int N=sizeof(st->ctr.v)/sizeof(st->ctr.v[0]); \ + if( st->elem == 0 ){ \ + CBRNGNAME##_ctr_t c = st->ctr; \ + c.v[N-1] |= st->n<<(R123_W(CBRNGNAME##_ctr_t)-32); \ + st->n++; \ + st->r = CBRNGNAME(c, st->key); \ + st->elem = N; \ + } \ + return 0xffffffff & st->r.v[--st->elem]; \ +} \ + \ +static double \ +NAME##_get_double (void * vstate) \ +{ \ + return NAME##_get (vstate)/4294967296.; \ +} \ + \ +static void NAME##_set(void *vstate, unsigned long int s){ \ + NAME##_state *st = (NAME##_state *)vstate; \ + (void)s; /* ignored */ \ + st->elem = 0; \ + st->n = ~0; /* marks the state as not yet reset; NOTE(review): NAME_get above has no check that aborts on this value - confirm */ \ +} \ + \ +static const gsl_rng_type NAME##_type = { \ + #NAME, \ + 0xffffffffUL, \ + 0, \ + sizeof(NAME##_state), \ + &NAME##_set, \ + &NAME##_get, \ + &NAME##_get_double \ +}; \ + \ +R123_STATIC_INLINE void NAME##_reset(const gsl_rng* gr, CBRNGNAME##_ctr_t c, CBRNGNAME##_key_t k) { \ + NAME##_state* state = (NAME##_state *)gr->state; \ + state->ctr = c; \ + state->key = k; \ + state->n = 0; \ + state->elem = 0; \ +} \ + \ +const gsl_rng_type *gsl_rng_##NAME = &NAME##_type + +#endif diff --git a/ext/random123/include/Random123/philox.h b/ext/random123/include/Random123/philox.h new file mode 100644 index 0000000000000000000000000000000000000000..7bf4d195772358a87b8fbb33667783b5caba61a4 --- /dev/null +++ b/ext/random123/include/Random123/philox.h @@ -0,0 +1,493 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _philox_dot_h_ +#define _philox_dot_h_ + +/** \cond HIDDEN_FROM_DOXYGEN */ + +#include "features/compilerfeatures.h" +#include "array.h" + + +/* +// Macros _Foo_tpl are code generation 'templates' They define +// inline functions with names obtained by mangling Foo and the +// macro arguments. 
E.g., +// _mulhilo_dword_tpl(32, uint32_t, uint64_t) +// expands to a definition of: +// uint32_t mulhilo32(uint32_t, uint32_t, uint32_t *) +// We then 'instantiate the template' to define +// several different functions, e.g., +// mulhilo32 +// mulhilo64 +// These functions will be visible to user code, and may +// also be used later in subsequent templates and definitions. + +// A template for mulhilo using a temporary of twice the word-width. +// Gcc figures out that this can be reduced to a single 'mul' instruction, +// despite the apparent use of double-wide variables, shifts, etc. It's +// obviously not guaranteed that all compilers will be that smart, so +// other implementations might be preferable, e.g., using an intrinsic +// or an asm block. On the other hand, for 32-bit multiplies, +// this *is* perfectly standard C99 - any C99 compiler should +// understand it and produce correct code. For 64-bit multiplies, +// it's only usable if the compiler recognizes that it can do +// arithmetic on a 128-bit type. That happens to be true for gcc on +// x86-64, and powerpc64 but not much else. The generated mulhilo##W returns the low word of the product and stores the high word through hip. +*/ +#define _mulhilo_dword_tpl(W, Word, Dword) \ +R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ + Dword product = ((Dword)a)*((Dword)b); \ + *hip = product>>W; /* high word */ \ + return (Word)product; /* low word */ \ +} + +/* +// A template for mulhilo using gnu-style asm syntax. +// INSN can be "mulw", "mull" or "mulq". +// FIXME - porting to other architectures, we'll need still-more conditional +// branching here. Note that intrinsics are usually preferable.
+*/ +#ifdef __powerpc__ +#define _mulhilo_asm_tpl(W, Word, INSN) \ +R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \ + Word dx = 0; \ + __asm__("\n\t" \ + INSN " %0,%1,%2\n\t" \ + : "=r"(dx) \ + : "r"(b), "r"(ax) \ + ); \ + *hip = dx; /* INSN result, stored as the high word */ \ + return ax*b; /* low word computed in C */ \ +} +#else +#define _mulhilo_asm_tpl(W, Word, INSN) \ +R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \ + Word dx; \ + __asm__("\n\t" \ + INSN " %2\n\t" \ + : "=a"(ax), "=d"(dx) \ + : "r"(b), "0"(ax) \ + ); \ + *hip = dx; /* dx output, stored as the high word */ \ + return ax; /* ax output, returned as the low word */ \ +} +#endif /* __powerpc__ */ + +/* +// A template for mulhilo using MSVC-style intrinsics. +// For example, _umul128 is an MSVC intrinsic, cf. +// http://msdn.microsoft.com/en-us/library/3dayytw9.aspx +*/ +#define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \ +R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ + return INTRIN(a, b, hip); \ +} + +/* N.B. This really should be called _mulhilo_mulhi_intrin. It just + happens that CUDA was the first time we used the idiom. */ +#define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \ +R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, R123_METAL_THREAD_ADDRESS_SPACE Word* hip){ \ + *hip = INTRIN(a, b); \ + return a*b; \ +} + +/* +// A template for mulhilo using only word-size operations and +// C99 operators (no adc, no mulhi). It +// requires four multiplies and a dozen or so shifts, adds +// and tests. It's *SLOW*. It can be used to +// implement philoxNx32 on platforms that completely lack +// 64-bit types, e.g., Metal. +// On 32-bit platforms, it could be used to +// implement philoxNx64, but on such platforms both the philoxNx32 +// and the threefryNx64 cbrngs are going to have much better +// performance. It is enabled below by R123_USE_MULHILO64_C99, +// but that is currently (Feb 2019) only set by +// features/metalfeatures.h headers. It can, of course, be +// set with a compile-time -D option.
+*/ +#define _mulhilo_c99_tpl(W, Word) \ +R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, R123_METAL_THREAD_ADDRESS_SPACE Word *hip){ \ + const unsigned WHALF = W/2; \ + const Word LOMASK = ((((Word)1)<<WHALF)-1); \ + Word lo = a*b; /* full low multiply */ \ + Word ahi = a>>WHALF; /* split both operands into half-words */ \ + Word alo = a& LOMASK; \ + Word bhi = b>>WHALF; \ + Word blo = b& LOMASK; \ + \ + Word ahbl = ahi*blo; /* the two cross products */ \ + Word albh = alo*bhi; \ + \ + Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK)); \ + Word hi = ahi*bhi + (ahbl>>WHALF) + (albh>>WHALF); \ + hi += ahbl_albh >> WHALF; /* carry from the sum of lo(ahbl) + lo(albh) */ \ + /* carry from the sum with alo*blo: the upper half of lo is smaller than the low half of the cross sum exactly when that addition wrapped */ \ + hi += ((lo >> WHALF) < (ahbl_albh&LOMASK)); \ + *hip = hi; \ + return lo; \ +} + +/* +// A template for mulhilo on a platform that can't do it. +// We could put a C version here, but is it better to run *VERY* +// slowly or to just stop and force the user to find another CBRNG? +*/ +#define _mulhilo_fail_tpl(W, Word) \ +R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \ + R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \ +} + +/* +// N.B.
There's an MSVC intrinsic called _emul, +// which *might* compile into better code than +// _mulhilo_dword_tpl +*/ +#if R123_USE_MULHILO32_ASM +#ifdef __powerpc__ +_mulhilo_asm_tpl(32, uint32_t, "mulhwu") +#else +_mulhilo_asm_tpl(32, uint32_t, "mull") +#endif /* __powerpc__ */ +#else +#if R123_USE_64BIT +_mulhilo_dword_tpl(32, uint32_t, uint64_t) +#elif R123_USE_MULHILO32_MULHI_INTRIN +_mulhilo_cuda_intrin_tpl(32, uint32_t, R123_MULHILO32_MULHI_INTRIN) +#else +_mulhilo_c99_tpl(32, uint32_t) +#endif +#endif + +#if R123_USE_PHILOX_64BIT +#if R123_USE_MULHILO64_ASM +#ifdef __powerpc64__ +_mulhilo_asm_tpl(64, uint64_t, "mulhdu") +#else +_mulhilo_asm_tpl(64, uint64_t, "mulq") +#endif /* __powerpc64__ */ +#elif R123_USE_MULHILO64_MSVC_INTRIN +_mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128) +#elif R123_USE_MULHILO64_CUDA_INTRIN +_mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi) +#elif R123_USE_MULHILO64_OPENCL_INTRIN +_mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi) +#elif R123_USE_MULHILO64_MULHI_INTRIN +_mulhilo_cuda_intrin_tpl(64, uint64_t, R123_MULHILO64_MULHI_INTRIN) +#elif R123_USE_GNU_UINT128 +_mulhilo_dword_tpl(64, uint64_t, __uint128_t) +#elif R123_USE_MULHILO64_C99 +_mulhilo_c99_tpl(64, uint64_t) +#else +_mulhilo_fail_tpl(64, uint64_t) +#endif +#endif + +/* +// The multipliers and Weyl constants are "hard coded". +// To change them, you can #define them with different +// values before #include-ing this file. +// This isn't terribly elegant, but it works for C as +// well as C++. 
A nice C++-only solution would be to +// use template parameters in the style of <random> +*/ +#ifndef PHILOX_M2x64_0 +#define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93) +#endif + +#ifndef PHILOX_M4x64_0 +#define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93) +#endif + +#ifndef PHILOX_M4x64_1 +#define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157) +#endif + +#ifndef PHILOX_M2x32_0 +#define PHILOX_M2x32_0 ((uint32_t)0xd256d193) +#endif + +#ifndef PHILOX_M4x32_0 +#define PHILOX_M4x32_0 ((uint32_t)0xD2511F53) +#endif +#ifndef PHILOX_M4x32_1 +#define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57) +#endif + +#ifndef PHILOX_W64_0 +#define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15) /* golden ratio */ +#endif +#ifndef PHILOX_W64_1 +#define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B) /* sqrt(3)-1 */ +#endif + +#ifndef PHILOX_W32_0 +#define PHILOX_W32_0 ((uint32_t)0x9E3779B9) +#endif +#ifndef PHILOX_W32_1 +#define PHILOX_W32_1 ((uint32_t)0xBB67AE85) +#endif + +/** \endcond */ +#ifndef PHILOX2x32_DEFAULT_ROUNDS +#define PHILOX2x32_DEFAULT_ROUNDS 10 +#endif + +#ifndef PHILOX2x64_DEFAULT_ROUNDS +#define PHILOX2x64_DEFAULT_ROUNDS 10 +#endif + +#ifndef PHILOX4x32_DEFAULT_ROUNDS +#define PHILOX4x32_DEFAULT_ROUNDS 10 +#endif + +#ifndef PHILOX4x64_DEFAULT_ROUNDS +#define PHILOX4x64_DEFAULT_ROUNDS 10 +#endif +/** \cond HIDDEN_FROM_DOXYGEN */ + +/* The ignored fourth argument allows us to instantiate the + same macro regardless of N. 
*/ +#define _philox2xWround_tpl(W, T) \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \ + T hi; \ + T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); /* lo is returned by mulhilo, hi is written through the pointer */ \ + struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}}; \ + return out; \ +} +#define _philox2xWbumpkey_tpl(W) \ +R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \ + key.v[0] += PHILOX_W##W##_0; \ + return key; \ +} + +#define _philox4xWround_tpl(W, T) \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \ + T hi0; \ + T hi1; \ + T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \ + T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \ + struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1, \ + hi0^ctr.v[3]^key.v[1], lo0}}; \ + return out; \ +} + +#define _philox4xWbumpkey_tpl(W) \ +R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \ + key.v[0] += PHILOX_W##W##_0; \ + key.v[1] += PHILOX_W##W##_1; \ + return key; \ +} + +/** \endcond */ +#define _philoxNxW_tpl(N, Nhalf, W, T) \ +/** @ingroup PhiloxNxW (note: the T parameter is not referenced by this template) */ \ +enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \ +typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \ +typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \ +typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk)
{ return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \ + R123_ASSERT(R<=16); /* only 16 rounds are unrolled below */ \ + if(R>0){ ctr = _philox##N##x##W##round(ctr, key); } /* first round uses the key as passed in */ \ + if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } /* every later round bumps the key first */ \ + if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + return ctr; \ +} + +_philox2xWbumpkey_tpl(32) +_philox4xWbumpkey_tpl(32) +_philox2xWround_tpl(32, uint32_t) /* philox2x32round */
+_philox4xWround_tpl(32, uint32_t) /* philox4x32round */ + +_philoxNxW_tpl(2, 1, 32, uint32_t) /* philox2x32bijection */ +_philoxNxW_tpl(4, 2, 32, uint32_t) /* philox4x32bijection */ +#if R123_USE_PHILOX_64BIT +/** \cond HIDDEN_FROM_DOXYGEN */ +_philox2xWbumpkey_tpl(64) +_philox4xWbumpkey_tpl(64) +_philox2xWround_tpl(64, uint64_t) /* philox2x64round */ +_philox4xWround_tpl(64, uint64_t) /* philox4x64round */ +/** \endcond */ +_philoxNxW_tpl(2, 1, 64, uint64_t) /* philox2x64bijection */ +_philoxNxW_tpl(4, 2, 64, uint64_t) /* philox4x64bijection */ +#endif /* R123_USE_PHILOX_64BIT */ + +#define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k) +#define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k) +#if R123_USE_PHILOX_64BIT +#define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k) +#define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k) +#endif /* R123_USE_PHILOX_64BIT */ + +#if defined(__cplusplus) + +#define _PhiloxNxW_base_tpl(CType, KType, N, W) \ +namespace r123{ \ +template<unsigned int ROUNDS> \ +struct Philox##N##x##W##_R{ \ + typedef CType ctr_type; \ + typedef KType key_type; \ + typedef KType ukey_type; \ + static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \ + inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \ + R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \ + return philox##N##x##W##_R(ROUNDS, ctr, key); \ + } \ +}; \ +typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \ + } // namespace r123 + +_PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32) // Philox2x32_R<R> +_PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32) // Philox4x32_R<R> +#if R123_USE_PHILOX_64BIT +_PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64) // Philox2x64_R<R> +_PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64) // Philox4x64_R<R> +#endif + +/* The _tpl macros don't quite work to do string-pasting inside comments,
+ so we just write out the boilerplate documentation four times... */ + +/** +@defgroup PhiloxNxW Philox Classes and Typedefs + +The PhiloxNxW classes export the member functions, typedefs and +operator overloads required by a @ref CBRNG "CBRNG" class. + +As described in +<a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers: As Easy as 1, 2, 3</i> </a>, +the Philox family of counter-based RNGs uses integer multiplication, xor and permutation of W-bit words +to scramble its N-word input key. Philox is a mnemonic for Product HI LO Xor. + + +@class r123::Philox2x32_R +@ingroup PhiloxNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Philox round +function will be applied. + +As of November 2011, the authors know of no statistical flaws with +ROUNDS=6 or more for Philox2x32. + +@typedef r123::Philox2x32 +@ingroup PhiloxNxW + Philox2x32 is equivalent to Philox2x32_R<10>. With 10 rounds, + Philox2x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + + + +@class r123::Philox2x64_R +@ingroup PhiloxNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Philox round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=6 or more for Philox2x64. + +@typedef r123::Philox2x64 +@ingroup PhiloxNxW + Philox2x64 is equivalent to Philox2x64_R<10>. With 10 rounds, + Philox2x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + + + +@class r123::Philox4x32_R +@ingroup PhiloxNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+ +The template argument, ROUNDS, is the number of times the Philox round +function will be applied. + +In November 2011, the authors recorded some suspicious p-values (approximately 1.e-7) from +some very long (longer than the default BigCrush length) SimpPoker tests. Despite +the fact that even longer tests reverted to "passing" p-values, a cloud remains over +Philox4x32 with 7 rounds. The authors know of no statistical flaws with +ROUNDS=8 or more for Philox4x32. + +@typedef r123::Philox4x32 +@ingroup PhiloxNxW + Philox4x32 is equivalent to Philox4x32_R<10>. With 10 rounds, + Philox4x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + + + +@class r123::Philox4x64_R +@ingroup PhiloxNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Philox round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=7 or more for Philox4x64. + +@typedef r123::Philox4x64 +@ingroup PhiloxNxW + Philox4x64 is equivalent to Philox4x64_R<10>. With 10 rounds, + Philox4x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. +*/ + +#endif /* __cplusplus */ + +#endif /* _philox_dot_h_ */ diff --git a/ext/random123/include/Random123/threefry.h b/ext/random123/include/Random123/threefry.h new file mode 100644 index 0000000000000000000000000000000000000000..390ceffe6865e6d23d7c69b38fb8f022abc532f6 --- /dev/null +++ b/ext/random123/include/Random123/threefry.h @@ -0,0 +1,870 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _threefry_dot_h_ +#define _threefry_dot_h_ +#include "features/compilerfeatures.h" +#include "array.h" + +/** \cond HIDDEN_FROM_DOXYGEN */ +/* Significant parts of this file were copied + from: + Skein_FinalRnd/ReferenceImplementation/skein.h + Skein_FinalRnd/ReferenceImplementation/skein_block.c + + in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip + + This file has been modified so that it may no longer perform its originally + intended function.
If you're looking for a Skein or Threefish source code, + please consult the original file. + + The original file had the following header: +************************************************************************** +** +** Interface declarations and internal definitions for Skein hashing. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +*************************************************************************** + +*/ + +/* See comment at the top of philox.h for the macro pre-process + strategy. */ + +/* Rotation constants: */ +enum r123_enum_threefry64x4 { + /* These are the R_256 constants from the Threefish reference sources + with names changed to R_64x4... */ + R_64x4_0_0=14, R_64x4_0_1=16, + R_64x4_1_0=52, R_64x4_1_1=57, + R_64x4_2_0=23, R_64x4_2_1=40, + R_64x4_3_0= 5, R_64x4_3_1=37, + R_64x4_4_0=25, R_64x4_4_1=33, + R_64x4_5_0=46, R_64x4_5_1=12, + R_64x4_6_0=58, R_64x4_6_1=22, + R_64x4_7_0=32, R_64x4_7_1=32 +}; + +enum r123_enum_threefry64x2 { + /* + // Output from skein_rot_search: (srs64_B64-X1000) + // Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57 + // Start: Tue Mar 1 10:07:48 2011 + // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format + */ + R_64x2_0_0=16, + R_64x2_1_0=42, + R_64x2_2_0=12, + R_64x2_3_0=31, + R_64x2_4_0=16, + R_64x2_5_0=32, + R_64x2_6_0=24, + R_64x2_7_0=21 + /* 4 rounds: minHW = 4 [ 4 4 4 4 ] + // 5 rounds: minHW = 8 [ 8 8 8 8 ] + // 6 rounds: minHW = 16 [ 16 16 16 16 ] + // 7 rounds: minHW = 32 [ 32 32 32 32 ] + // 8 rounds: minHW = 64 [ 64 64 64 64 ] + // 9 rounds: minHW = 64 [ 64 64 64 64 ] + //10 rounds: minHW = 64 [ 64 64 64 64 ] + //11 rounds: minHW = 64 [ 64 64 64 64 ] */ +}; + +enum r123_enum_threefry32x4 { + /* Output from skein_rot_search: (srs-B128-X5000.out) + // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. 
rounds = 8, minHW_or=28 + // Start: Mon Aug 24 22:41:36 2009 + // ... + // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */ + R_32x4_0_0=10, R_32x4_0_1=26, + R_32x4_1_0=11, R_32x4_1_1=21, + R_32x4_2_0=13, R_32x4_2_1=27, + R_32x4_3_0=23, R_32x4_3_1= 5, + R_32x4_4_0= 6, R_32x4_4_1=20, + R_32x4_5_0=17, R_32x4_5_1=11, + R_32x4_6_0=25, R_32x4_6_1=10, + R_32x4_7_0=18, R_32x4_7_1=20 + + /* 4 rounds: minHW = 3 [ 3 3 3 3 ] + // 5 rounds: minHW = 7 [ 7 7 7 7 ] + // 6 rounds: minHW = 12 [ 13 12 13 12 ] + // 7 rounds: minHW = 22 [ 22 23 22 23 ] + // 8 rounds: minHW = 31 [ 31 31 31 31 ] + // 9 rounds: minHW = 32 [ 32 32 32 32 ] + //10 rounds: minHW = 32 [ 32 32 32 32 ] + //11 rounds: minHW = 32 [ 32 32 32 32 ] */ + +}; + +enum r123_enum_threefry32x2 { + /* Output from skein_rot_search (srs32x2-X5000.out) + // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28 + // Start: Tue Jul 12 11:11:33 2011 + // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. 
blkSize= 64].format */ + R_32x2_0_0=13, + R_32x2_1_0=15, + R_32x2_2_0=26, + R_32x2_3_0= 6, + R_32x2_4_0=17, + R_32x2_5_0=29, + R_32x2_6_0=16, + R_32x2_7_0=24 + + /* 4 rounds: minHW = 4 [ 4 4 4 4 ] + // 5 rounds: minHW = 6 [ 6 8 6 8 ] + // 6 rounds: minHW = 9 [ 9 12 9 12 ] + // 7 rounds: minHW = 16 [ 16 24 16 24 ] + // 8 rounds: minHW = 32 [ 32 32 32 32 ] + // 9 rounds: minHW = 32 [ 32 32 32 32 ] + //10 rounds: minHW = 32 [ 32 32 32 32 ] + //11 rounds: minHW = 32 [ 32 32 32 32 ] */ + }; + +enum r123_enum_threefry_wcnt { + WCNT2=2, + WCNT4=4 +}; + +#if R123_USE_64BIT +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N)); +R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N) +{ + return (x << (N & 63)) | (x >> ((64-N) & 63)); +} +#endif + +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N)); +R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N) +{ + return (x << (N & 31)) | (x >> ((32-N) & 31)); +} + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) +#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) +#define SKEIN_KS_PARITY32 0x1BD11BDA + +/** \endcond */ + +#ifndef THREEFRY2x32_DEFAULT_ROUNDS +#define THREEFRY2x32_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY2x64_DEFAULT_ROUNDS +#define THREEFRY2x64_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY4x32_DEFAULT_ROUNDS +#define THREEFRY4x32_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY4x64_DEFAULT_ROUNDS +#define THREEFRY4x64_DEFAULT_ROUNDS 20 +#endif + +#define _threefry2x_tpl(W) \ +typedef struct r123array2x##W threefry2x##W##_ctr_t; \ +typedef struct r123array2x##W threefry2x##W##_key_t; \ +typedef struct r123array2x##W threefry2x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE 
R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ + threefry2x##W##_ctr_t X; \ + uint##W##_t ks[2+1]; \ + int i; /* avoid size_t to avoid need for stddef.h */ \ + R123_ASSERT(Nrounds<=32); \ + ks[2] = SKEIN_KS_PARITY##W; \ + for (i=0;i < 2; i++) \ + { \ + ks[i] = k.v[i]; \ + X.v[i] = in.v[i]; \ + ks[2] ^= k.v[i]; \ + } \ + \ + /* Insert initial key before round 0 */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + \ + if(Nrounds>0){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>1){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>2){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>3){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>3){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 1; /* X.v[2-1] += r */ \ + } \ + if(Nrounds>4){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>5){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>6){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>7){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>7){ \ + /* InjectKey(r=2) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 2; \ + } \ + if(Nrounds>8){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>9){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>10){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>11){ X.v[0] += X.v[1]; 
X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>11){ \ + /* InjectKey(r=3) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + X.v[1] += 3; \ + } \ + if(Nrounds>12){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>13){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>14){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>15){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>15){ \ + /* InjectKey(r=4) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 4; \ + } \ + if(Nrounds>16){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>17){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>18){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>19){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>19){ \ + /* InjectKey(r=5) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 5; \ + } \ + if(Nrounds>20){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>21){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>22){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>23){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>23){ \ + /* InjectKey(r=6) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + X.v[1] += 6; \ + } \ + if(Nrounds>24){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>25){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>26){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>27){ X.v[0] += 
X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>27){ \ + /* InjectKey(r=7) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 7; \ + } \ + if(Nrounds>28){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>29){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>30){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>31){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>31){ \ + /* InjectKey(r=8) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 8; \ + } \ + return X; \ +} \ + /** @ingroup ThreefryNxW */ \ +enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ + return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \ +} + + +#define _threefry4x_tpl(W) \ +typedef struct r123array4x##W threefry4x##W##_ctr_t; \ +typedef struct r123array4x##W threefry4x##W##_key_t; \ +typedef struct r123array4x##W threefry4x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ + threefry4x##W##_ctr_t X; \ + uint##W##_t ks[4+1]; \ + int i; /* avoid size_t to avoid need for stddef.h */ \ + R123_ASSERT(Nrounds<=72); \ + ks[4] = SKEIN_KS_PARITY##W; \ + for 
(i=0;i < 4; i++) \ + { \ + ks[i] = k.v[i]; \ + X.v[i] = in.v[i]; \ + ks[4] ^= k.v[i]; \ + } \ + \ + /* Insert initial key before round 0 */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + \ + if(Nrounds>0){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>1){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>2){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>3){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>3){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 1; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>4){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>5){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>6){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>7){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>7){ \ + /* InjectKey(r=2) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ 
+ X.v[4-1] += 2; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>8){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>9){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>10){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>11){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>11){ \ + /* InjectKey(r=3) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 3; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>12){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>13){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>14){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>15){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>15){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 4; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>16){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + 
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>17){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>18){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>19){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>19){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 5; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>20){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>21){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>22){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>23){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>23){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 6; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>24){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>25){ \ + X.v[0] += X.v[3]; X.v[3] = 
RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>26){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>27){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>27){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 7; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>28){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>29){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>30){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>31){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>31){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 8; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>32){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>33){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + 
if(Nrounds>34){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>35){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>35){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 9; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>36){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>37){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>38){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>39){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>39){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 10; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>40){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>41){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>42){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = 
RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>43){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>43){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 11; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>44){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>45){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>46){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>47){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>47){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 12; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>48){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>49){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>50){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>51){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= 
X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>51){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 13; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>52){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>53){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>54){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>55){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>55){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 14; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>56){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>57){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>58){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>59){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>59){ \ + /* InjectKey(r=1) */ \ + X.v[0] += 
ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 15; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>60){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>61){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>62){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>63){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>63){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 16; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>64){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>65){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>66){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>67){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>67){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 17; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>68){ \ + X.v[0] += X.v[1]; 
X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>69){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>70){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>71){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>71){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 18; /* X.v[WCNT4-1] += r */ \ + } \ + \ + return X; \ +} \ + \ + /** @ingroup ThreefryNxW */ \ +enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ + return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \ +} + +#if R123_USE_64BIT +_threefry2x_tpl(64) +_threefry4x_tpl(64) +#endif +_threefry2x_tpl(32) +_threefry4x_tpl(32) + +/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better + than a static inline function. Why? 
*/ +#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k) +#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k) +#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k) +#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k) + +#if defined(__cplusplus) +#define _threefryNxWclass_tpl(NxW) \ +namespace r123{ \ +template<unsigned int ROUNDS> \ + struct Threefry##NxW##_R{ \ + typedef threefry##NxW##_ctr_t ctr_type; \ + typedef threefry##NxW##_key_t key_type; \ + typedef threefry##NxW##_key_t ukey_type; \ + static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \ + inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \ + R123_STATIC_ASSERT(ROUNDS<=72, "threefry is only unrolled up to 72 rounds\n"); \ + return threefry##NxW##_R(ROUNDS, ctr, key); \ + } \ +}; \ + typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \ +} // namespace r123 + +_threefryNxWclass_tpl(2x32) +_threefryNxWclass_tpl(4x32) +#if R123_USE_64BIT +_threefryNxWclass_tpl(2x64) +_threefryNxWclass_tpl(4x64) +#endif + +/* The _tpl macros don't quite work to do string-pasting inside comments. + so we just write out the boilerplate documentation four times... */ + +/** +@defgroup ThreefryNxW Threefry Classes and Typedefs + +The ThreefryNxW classes export the member functions, typedefs and +operator overloads required by a @ref CBRNG "CBRNG" class. + +As described in +<a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers: As Easy as 1, 2, 3</i> </a>, +the Threefry family is closely related to the Threefish block cipher from +<a href="http://www.skein-hash.info/"> Skein Hash Function</a>. +Threefry is \b not suitable for cryptographic use. + +Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output. 
+ +@class r123::Threefry2x32_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=13 or more for Threefry2x32. + +@typedef r123::Threefry2x32 +@ingroup ThreefryNxW + Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds, + Threefry2x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + +@class r123::Threefry2x64_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +In November 2011, the authors discovered that 13 rounds of +Threefry2x64 sequenced by strided, interleaved key and counter +increments failed a very long (longer than the default BigCrush +length) WeightDistrib test. At the same time, it was confirmed that +14 rounds pass much longer tests (up to 5x10^12 samples) of a +similar nature. The authors know of no statistical flaws with +ROUNDS=14 or more for Threefry2x64. + +@typedef r123::Threefry2x64 +@ingroup ThreefryNxW + Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds, + Threefry2x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + + + +@class r123::Threefry4x32_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=12 or more for Threefry4x32. 
+ +@typedef r123::Threefry4x32 +@ingroup ThreefryNxW + Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds, + Threefry4x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + + + +@class r123::Threefry4x64_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=12 or more for Threefry4x64. + +@typedef r123::Threefry4x64 +@ingroup ThreefryNxW + Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds, + Threefry4x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. +*/ + +#endif + +#endif diff --git a/ext/random123/include/Random123/u01fixedpt.h b/ext/random123/include/Random123/u01fixedpt.h new file mode 100644 index 0000000000000000000000000000000000000000..2058f8b57efcbb14a82f7d14def1066c24732dc9 --- /dev/null +++ b/ext/random123/include/Random123/u01fixedpt.h @@ -0,0 +1,200 @@ +/* +Copyright 2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. 
Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _random123_ufixed01_dot_h_ +#define _random123_ufixed01_dot_h_ + +#include "features/compilerfeatures.h" + +/** @defgroup u01fixedpt The u01fixedpt conversion functions + + These functions convert unsigned W-bit integers to uniformly + spaced real values (float or double) between 0.0 and 1.0 with + mantissas of M bits. + + PLEASE THINK CAREFULLY BEFORE USING THESE FUNCTIONS. THEY MAY + NOT BE WHAT YOU WANT. YOU MAY BE MUCH BETTER SERVED BY THE + FUNCTIONS IN ./uniform.hpp. + + These functions produce a finite number of *uniformly spaced* values + in the range from 0.0 to 1.0 with uniform probability. The price + of uniform spacing is that they may not utilize the entire space + of possible outputs. E.g., u01fixedpt_closed_open_32_float will never + produce a non-zero value less than 2^-24, even though such values + are representable in single-precision floating point. 
+ + There are 12 functions, corresponding to the following choices: + + - W = 32 or 64 + - M = 24 (float) or 53 (double) + - open0 or closed0 : whether the output is open or closed at 0.0 + - open1 or closed1 : whether the output is open or closed at 1.0 + + The W=64 M=24 cases are not implemented. To obtain an M=24 float + from a uint64_t, use a cast (possibly with right-shift and bitwise + and) to convert some of the bits of the uint64_t to a uint32_t and + then use u01fixedpt_x_y_32_float. Note that the 64-bit random integers + produced by the Random123 library are random in "all the bits", so + with a little extra effort you can obtain two floats this way -- + one from the high bits and one from the low bits of the 64-bit + value. + + If the output is open at one end, then the extreme + value (0.0 or 1.0) will never be returned. Conversely, if the output + is closed at one end, then the extreme value is a possible + return value. + + The values returned are as follows. All values are returned + with equal frequency, except as noted in the closed_closed case: + + closed_open: Let P=min(M,W) + there are 2^P possible output values: + {0, 1, 2, ..., 2^P-1}/2^P + + open_closed: Let P=min(M,W) + there are 2^P possible values: + {1, 2, ..., 2^P}/2^P + + open_open: Let P=min(M, W+1) + there are 2^(P-1) possible values: + {1, 3, 5, ..., 2^P-1}/2^P + + closed_closed: Let P=min(M, W-1) + there are 1+2^P possible values: + {0, 1, 2, ... 2^P}/2^P + The extreme values (0.0 and 1.0) are + returned with half the frequency of + all others. + + On x86 hardware, especially on 32bit machines, the use of + internal 80bit x87-style floating point may result in + 'bonus' precision, which may cause closed intervals to not + be really closed, i.e. the conversions below might not + convert UINT{32,64}_MAX to 1.0. 
This sort of issue is + likely to occur when storing the output of a u01fixedpt_*_32_float + function in a double, though one can imagine getting extra + precision artifacts when going from 64_53 as well. Other + artifacts may exist on some GPU hardware. The tests in + kat_u01_main.h try to expose such issues, but caveat emptor. + + @cond HIDDEN_FROM_DOXYGEN + */ + +/* Hex floats were standardized by C in 1999, but weren't standardized + by C++ until 2011. So, we're obliged to write out our constants in + decimal, even though they're most naturally expressed in binary. + We cross our fingers and hope that the compiler does the compile-time + constant arithmetic properly. +*/ +#define R123_0x1p_31f (1.f/(1024.f*1024.f*1024.f*2.f)) +#define R123_0x1p_24f (128.f*R123_0x1p_31f) +#define R123_0x1p_23f (256.f*R123_0x1p_31f) +#define R123_0x1p_32 (1./(1024.*1024.*1024.*4.)) +#define R123_0x1p_63 (2.*R123_0x1p_32*R123_0x1p_32) +#define R123_0x1p_53 (1024.*R123_0x1p_63) +#define R123_0x1p_52 (2048.*R123_0x1p_63) + +/** @endcond */ + +#ifndef R123_USE_U01_DOUBLE +#define R123_USE_U01_DOUBLE 1 +#endif + +#ifdef __cplusplus +extern "C"{ +#endif + +/* narrowing conversions: uint32_t to float */ +R123_CUDA_DEVICE R123_STATIC_INLINE float u01fixedpt_closed_closed_32_float(uint32_t i){ + /* N.B. 
we ignore the high bit, so output is not monotonic */ + return ((i&0x7fffffc0) + (i&0x40))*R123_0x1p_31f; /* 0x1.p-31f */ +} + +R123_CUDA_DEVICE R123_STATIC_INLINE float u01fixedpt_closed_open_32_float(uint32_t i){ + return (i>>8)*R123_0x1p_24f; /* 0x1.0p-24f; */ +} + +R123_CUDA_DEVICE R123_STATIC_INLINE float u01fixedpt_open_closed_32_float(uint32_t i){ + return (1+(i>>8))*R123_0x1p_24f; /* *0x1.0p-24f; */ +} + +R123_CUDA_DEVICE R123_STATIC_INLINE float u01fixedpt_open_open_32_float(uint32_t i){ + return (0.5f+(i>>9))*R123_0x1p_23f; /* 0x1.p-23f; */ +} + +#if R123_USE_U01_DOUBLE +/* narrowing conversions: uint64_t to double */ +R123_CUDA_DEVICE R123_STATIC_INLINE double u01fixedpt_closed_closed_64_double(uint64_t i){ + /* N.B. we ignore the high bit, so output is not monotonic */ + return ((i&R123_64BIT(0x7ffffffffffffe00)) + (i&0x200))*R123_0x1p_63; /* 0x1.p-63; */ +} + +R123_CUDA_DEVICE R123_STATIC_INLINE double u01fixedpt_closed_open_64_double(uint64_t i){ + return (i>>11)*R123_0x1p_53; /* 0x1.0p-53; */ +} + +R123_CUDA_DEVICE R123_STATIC_INLINE double u01fixedpt_open_closed_64_double(uint64_t i){ + return (1+(i>>11))*R123_0x1p_53; /* 0x1.0p-53; */ +} + +R123_CUDA_DEVICE R123_STATIC_INLINE double u01fixedpt_open_open_64_double(uint64_t i){ + return (0.5+(i>>12))*R123_0x1p_52; /* 0x1.0p-52; */ +} + +/* widening conversions: u32 to double */ +R123_CUDA_DEVICE R123_STATIC_INLINE double u01fixedpt_closed_closed_32_double(uint32_t i){ + /* j = i+(i&1) takes on 2^31+1 possible values with a 'trapezoid' distribution: + p_j = 1 0 2 0 2 .... 2 0 2 0 1 + j = 0 1 2 3 4 .... 2^32 + by converting to double *before* doing the add, we don't wrap the high bit. 
+ */ + return (((double)(i&1)) + i)*R123_0x1p_32; /* 0x1.p-32; */ +} + +R123_CUDA_DEVICE R123_STATIC_INLINE double u01fixedpt_closed_open_32_double(uint32_t i){ + return i*R123_0x1p_32; /* 0x1.p-32; */ +} + +R123_CUDA_DEVICE R123_STATIC_INLINE double u01fixedpt_open_closed_32_double(uint32_t i){ + return (1.+i)*R123_0x1p_32; /* 0x1.p-32; */ +} + +R123_CUDA_DEVICE R123_STATIC_INLINE double u01fixedpt_open_open_32_double(uint32_t i){ + return (0.5+i)*R123_0x1p_32; /* 0x1.p-32; */ +} +#endif /* R123_USE_U01_DOUBLE */ + +#ifdef __cplusplus +} +#endif + +/** @} */ +#endif diff --git a/ext/random123/include/Random123/uniform.hpp b/ext/random123/include/Random123/uniform.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a815066ae8d302f7fba8c57c1feec56979fc5bd8 --- /dev/null +++ b/ext/random123/include/Random123/uniform.hpp @@ -0,0 +1,310 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef __r123_uniform_dot_hpp +#define __r123_uniform_dot_hpp + +/** @defgroup uniform Uniform distribution scalar conversion functions + +This file provides some simple functions that can be used to convert +integers of various widths to floats and doubles with various +characteristics. It can be used to generate real-valued, uniformly +distributed random variables from the random integers produced by +the Random123 CBRNGs. + +There are three templated functions: + + - u01: output is as dense as possible in (0,1}, never 0.0. May + return 1.0 if and only if the number of output mantissa bits + is less than the width of the input. + + - uneg11: output is as dense as possible in {-1,1}, never 0.0. May + return 1.0 or -1.0 if and only if the number of output mantissa bits + is less than the width of the input. + + - u01fixedpt: output is "fixed point", equispaced, open at both ends, + and is never 0.0, 0.5 nor 1.0. + +The behavior of u01 and uneg11 depend on the pre-processor symbol: +R123_UNIFORM_FLOAT_STORE. When #defined to a non-zero value, u01 +and uneg11 declare a volatile intermediate result, with the +intention of forcing architectures that have "extra bits" in their +floating point registers to more closely conform to IEEE +arithmetic. When compiled this way, u01 and uneg11 will be +significantly slower, as they will incur a memory write and read on +every call. 
Without it, they may fail the "known answer test" +implemented in ut_uniform_IEEEkat.cpp even though they perform +perfectly reasonable int to float conversions. We have used +this option to get 32-bit x86 to produce the same results as +64-bit x86-64 code, but we do not recommend it for normal +use. + +Three additional functions are defined when C++11 or newer is in use: + + - u01all + - uneg11all + - u01fixedptall + +These functions apply the corresponding conversion to every +element of their argument, which must be a statically sized +array, e.g., an r123array or a std::array of an integer type. + +This file may not be as portable, and has not been tested as +rigorously as other files in the library, e.g., the generators. +Nevertheless, we hope it is useful and we encourage developers to +copy it and modify it for their own use. We invite comments and +improvements. +*/ + +#include <Random123/features/compilerfeatures.h> +#include <limits> +#if R123_USE_CXX11_TYPE_TRAITS +#include <type_traits> +#endif +#if __cplusplus >= 201103L +#include <array> +#endif + +namespace r123{ +/** +@{ +@cond HIDDEN_FROM_DOXYGEN +*/ + +#if R123_USE_CXX11_TYPE_TRAITS +using std::make_signed; +using std::make_unsigned; +#else +// Sigh... We could try to find another <type_traits>, e.g., from +// boost or TR1. Or we can do it ourselves in the r123 namespace. +// It's not clear which will cause less headache... 
+template <typename T> struct make_signed{}; +template <typename T> struct make_unsigned{}; +#define R123_MK_SIGNED_UNSIGNED(ST, UT) \ +template<> struct make_signed<ST>{ typedef ST type; }; \ +template<> struct make_signed<UT>{ typedef ST type; }; \ +template<> struct make_unsigned<ST>{ typedef UT type; }; \ +template<> struct make_unsigned<UT>{ typedef UT type; } + +R123_MK_SIGNED_UNSIGNED(int8_t, uint8_t); +R123_MK_SIGNED_UNSIGNED(int16_t, uint16_t); +R123_MK_SIGNED_UNSIGNED(int32_t, uint32_t); +R123_MK_SIGNED_UNSIGNED(int64_t, uint64_t); +#if R123_USE_GNU_UINT128 +R123_MK_SIGNED_UNSIGNED(__int128_t, __uint128_t); +#endif +#undef R123_MK_SIGNED_UNSIGNED +#endif + +#if defined(__CUDACC__) || defined(_LIBCPP_HAS_NO_CONSTEXPR) +// Amazing! cuda thinks numeric_limits::max() is a __host__ function, so +// we can't use it in a device function. +// +// The LIBCPP_HAS_NO_CONSTEXP test catches situations where the libc++ +// library thinks that the compiler doesn't support constexpr, but we +// think it does. As a consequence, the library declares +// numeric_limits::max without constexpr. This workaround should only +// affect a narrow range of compiler/library pairings. +// +// In both cases, we find max() by computing ~(unsigned)0 right-shifted +// by is_signed. +template <typename T> +R123_CONSTEXPR R123_STATIC_INLINE R123_CUDA_DEVICE T maxTvalue(){ + typedef typename make_unsigned<T>::type uT; + return (~uT(0)) >> std::numeric_limits<T>::is_signed; + } +#else +template <typename T> +R123_CONSTEXPR R123_STATIC_INLINE T maxTvalue(){ + return std::numeric_limits<T>::max(); +} +#endif +/** @endcond + @} + */ + +//! Return a uniform real value in (0, 1] +/** + @ingroup uniform + Input is a W-bit integer (signed or unsigned). It is cast to + a W-bit unsigned integer, multiplied by Ftype(2^-W) and added to + Ftype(2^(-W-1)). 
A good compiler should optimize it down to an + int-to-float conversion followed by a multiply and an add, which + might be fused, depending on the architecture. + + If the input is a uniformly distributed integer, and if Ftype + arithmetic follows IEEE754 round-to-nearest rules, then the + result is a uniformly distributed floating point number in (0, 1]. + +- The result is never exactly 0.0. +- The smallest value returned is 2^(-W-1). +- Let M be the number of mantissa bits in Ftype (typically 24 or 53). + - If W>M then the largest value returned is 1.0. + - If W<=M then the largest value returned is Ftype(1.0 - 2^(-W-1)). +*/ +template <typename Ftype, typename Itype> +R123_CUDA_DEVICE R123_STATIC_INLINE Ftype u01(Itype in){ + typedef typename make_unsigned<Itype>::type Utype; + R123_CONSTEXPR Ftype factor = Ftype(1.)/(maxTvalue<Utype>() + Ftype(1.)); + R123_CONSTEXPR Ftype halffactor = Ftype(0.5)*factor; +#if R123_UNIFORM_FLOAT_STORE + volatile Ftype x = Utype(in)*factor; return x+halffactor; +#else + return Utype(in)*factor + halffactor; +#endif +} + +//! Return a signed value in [-1,1] +/** + @ingroup uniform + The argument is converted to a W-bit signed integer, multiplied by Ftype(2^-(W-1)) and + then added to Ftype(2^-W). A good compiler should optimize + it down to an int-to-float conversion followed by a multiply and + an add, which might be fused, depending on the architecture. + + If the input is a uniformly distributed integer, and if Ftype + arithmetic follows IEEE754 round-to-nearest rules, then the + output is a uniformly distributed floating point number in [-1, 1]. + +- The result is never exactly 0.0. +- The smallest absolute value returned is 2^-W +- Let M be the number of mantissa bits in Ftype. + - If W>M then the largest value returned is 1.0 and the smallest is -1.0. + - If W<=M then the largest value returned is Ftype(1.0 - 2^-W) + and the smallest value returned is -Ftype(1.0 - 2^-W). 
+*/ +template <typename Ftype, typename Itype> +R123_CUDA_DEVICE R123_STATIC_INLINE Ftype uneg11(Itype in){ + typedef typename make_signed<Itype>::type Stype; + R123_CONSTEXPR Ftype factor = Ftype(1.)/(maxTvalue<Stype>() + Ftype(1.)); + R123_CONSTEXPR Ftype halffactor = Ftype(0.5)*factor; +#if R123_UNIFORM_FLOAT_STORE + volatile Ftype x = Stype(in)*factor; return x+halffactor; +#else + return Stype(in)*factor + halffactor; +#endif +} + +//! Return a value in (0,1) chosen from a set of equally spaced fixed-point values +/** + @ingroup uniform + Let: + - W = width of Itype, e.g., 32 or 64, regardless of signedness. + - M = mantissa bits of Ftype, e.g., 24, 53 or 64 + - B = min(M, W) + + Then the 2^(B-1) possible output values are: 2^-B*{1, 3, 5, ..., 2^B - 1} + + The smallest output is: 2^-B + + The largest output is: 1 - 2^-B + + The output is never exactly 0.0, nor 0.5, nor 1.0. + + The 2^(B-1) possible outputs: + - are equally likely, + - are uniformly spaced by 2^-(B-1), + - are balanced around 0.5 +*/ +template <typename Ftype, typename Itype> +R123_CUDA_DEVICE R123_STATIC_INLINE Ftype u01fixedpt(Itype in){ + typedef typename make_unsigned<Itype>::type Utype; + R123_CONSTEXPR int excess = std::numeric_limits<Utype>::digits - std::numeric_limits<Ftype>::digits; + if(excess>=0){ + R123_CONSTEXPR int ex_nowarn = (excess>=0) ? excess : 0; + R123_CONSTEXPR Ftype factor = Ftype(1.)/(Ftype(1.) + ((maxTvalue<Utype>()>>ex_nowarn))); + return (1 | (Utype(in)>>ex_nowarn)) * factor; + }else + return u01<Ftype>(in); +} + +#if R123_USE_CXX11_STD_ARRAY + +//! Apply u01 to every item in an r123array, returning a std::array +/** @ingroup uniform + * Only in C++11 and newer. + * The argument type may be any integer collection with a constexpr static_size member, + * e.g., an r123array or a std::array of an integer type. 
+ */ +template <typename Ftype, typename CollType> +static inline +std::array<Ftype, CollType::static_size> u01all(CollType in) +{ + std::array<Ftype, CollType::static_size> ret; + size_t i=0; + for(auto e : in){ + ret[i++] = u01<Ftype>(e); + } + return ret; +} + +//! Apply uneg11 to every item in an r123array, returning a std::array +/** @ingroup uniform + * Only in C++11 and newer. + * The argument type may be any integer collection with a constexpr static_size member, + * e.g., an r123array or a std::array of an integer type. + */ +template <typename Ftype, typename CollType> +static inline +std::array<Ftype, CollType::static_size> uneg11all(CollType in) +{ + std::array<Ftype, CollType::static_size> ret; + size_t i=0; + for(auto e : in){ + ret[i++] = uneg11<Ftype>(e); + } + return ret; +} + +//! Apply u01fixedpt to every item in an r123array, returning a std::array +/** @ingroup uniform + * Only in C++11 and newer. + * The argument type may be any integer collection with a constexpr static_size member, + * e.g., an r123array or a std::array of an integer type. 
+*/ +template <typename Ftype, typename CollType> +static inline +std::array<Ftype, CollType::static_size> u01fixedptall(CollType in) +{ + std::array<Ftype, CollType::static_size> ret; + size_t i=0; + for(auto e : in){ + ret[i++] = u01fixedpt<Ftype>(e); + } + return ret; +} +#endif // __cplusplus >= 201103L + +} // namespace r123 + +#endif + diff --git a/test/unit/test_morph_expr.cpp b/test/unit/test_morph_expr.cpp index 60d104e992458d8b64e0ad760c0f0e17b4d3317f..5a06a27586ea7c5e24b260f12830b9b3ea21ee39 100644 --- a/test/unit/test_morph_expr.cpp +++ b/test/unit/test_morph_expr.cpp @@ -24,6 +24,22 @@ namespace arb { } } +::testing::AssertionResult mlocation_eq(mlocation a, mlocation b) { + if (a.branch!=b.branch) { + return ::testing::AssertionFailure() + << "cables " << a << " and " << b << " differ"; + } + + using FP = testing::internal::FloatingPoint<double>; + if (FP(a.pos).AlmostEquals(FP(b.pos))) { + return ::testing::AssertionSuccess(); + } + else { + return ::testing::AssertionFailure() + << "mlocations " << a << " and " << b << " differ"; + } +} + ::testing::AssertionResult cable_eq(mcable a, mcable b) { if (a.branch!=b.branch) { return ::testing::AssertionFailure() @@ -54,6 +70,20 @@ namespace arb { return ::testing::AssertionSuccess(); } +::testing::AssertionResult mloctionlist_eq(const mlocation_list& as, const mlocation_list& bs) { + if (as.size()!=bs.size()) { + return ::testing::AssertionFailure() + << "cablelists " << as << " and " << bs << " differ"; + } + + for (auto i: util::count_along(as)) { + auto result = mlocation_eq(as[i], bs[i]); + if (!result) return ::testing::AssertionFailure() + << "mlocation lists " << as << " and " << bs << " differ"; + } + return ::testing::AssertionSuccess(); +} + TEST(region, expr_repn) { using util::to_string; @@ -268,9 +298,91 @@ TEST(locset, thingify) { // In the absence of a spherical root, there is no branch 4. 
EXPECT_THROW(thingify(begb4, mp), no_such_branch); } + { + mprovider mp(morphology(sm, false)); + + auto all = reg::all(); + auto ls0 = thingify(ls::uniform(all, 0, 9, 12), mp); + auto ls1 = thingify(ls::uniform(all, 0, 9, 12), mp); + auto ls2 = thingify(ls::uniform(all, 10, 19, 12), mp); + auto ls3 = thingify(ls::uniform(all, 0, 9, 13), mp); + auto ls4 = thingify(ls::uniform(all, 5, 6, 12), mp); + auto ls5 = thingify(ls::uniform(all, 2, 5, 12), mp); + auto ls6 = thingify(ls::uniform(all, 5, 11, 12), mp); + + EXPECT_EQ(ls0, ls1); + + bool found_none = true; + for (auto l: ls2) { + auto it = std::find(ls0.begin(), ls0.end(), l); + if (it != ls0.end()) { + found_none = false; + } + } + EXPECT_TRUE(found_none); + + found_none = true; + for (auto l: ls3) { + auto it = std::find(ls0.begin(), ls0.end(), l); + if (it != ls0.end()) { + found_none = false; + } + } + EXPECT_TRUE(found_none); + + bool found_all = true; + for (auto l: ls4) { + auto it = std::find(ls0.begin(), ls0.end(), l); + if (it == ls0.end()) { + found_all = false; + } + } + EXPECT_TRUE(found_all); + + int found = 0; + for (auto l: ls5) { + auto it = std::find(ls4.begin(), ls4.end(), l); + if (it != ls4.end()) found++; + } + EXPECT_TRUE(found == 1); + + found = 0; + for (auto l: ls6) { + auto it = std::find(ls4.begin(), ls4.end(), l); + if (it != ls4.end()) found++; + } + EXPECT_TRUE(found == 2); + } + { + mprovider mp(morphology(sm, false)); + auto sub_reg = join(reg::cable(0, 0.2, 0.7), reg::cable(1, 0.1, 1), reg::cable(3, 0.5, 0.6)); + + auto ls0 = thingify(ls::uniform(sub_reg, 0, 10000, 72), mp); + for (auto l: ls0) { + switch(l.branch) { + case 0: { + if (l.pos < 0.2 || l.pos > 0.7) FAIL(); + break; + } + case 1: { + if (l.pos < 0.1 || l.pos > 1) FAIL(); + break; + } + case 3: { + if (l.pos < 0.5 || l.pos > 0.6) FAIL(); + break; + } + default: { + FAIL(); + break; + } + } + SUCCEED(); + } + } } -TEST(region, thingify) { +TEST(region, thingify_simple_morphologies) { using pvec = std::vector<msize_t>; 
using svec = std::vector<msample>; using cl = mcable_list; @@ -352,16 +464,45 @@ TEST(region, thingify) { sample_tree sm(samples, parents); mprovider mp(morphology(sm, true)); + using ls::location; using reg::tagged; + using reg::distal_interval; + using reg::proximal_interval; using reg::branch; + using reg::cable; using reg::all; + locset mid0_ = location(0,0.5); + locset start1_ = location(1,0); + locset end1_ = location(1,1); + + auto reg0_ = distal_interval(start1_, 45); + auto reg1_ = distal_interval(mid0_, 74); + auto reg2_ = proximal_interval(end1_, 45); + auto reg3_ = proximal_interval(end1_, 91); + auto reg4_ = distal_interval(end1_, 0); + auto reg5_ = distal_interval(start1_, 0); + auto reg6_ = proximal_interval(start1_, 0); + EXPECT_EQ(thingify(tagged(1), mp), (mcable_list{{0,0,1}})); EXPECT_EQ(thingify(tagged(2), mp), (mcable_list{{2,0,1}})); EXPECT_EQ(thingify(tagged(3), mp), (mcable_list{{1,0,1}})); EXPECT_EQ(thingify(join(tagged(1), tagged(2), tagged(3)), mp), (mcable_list{{0,0,1}, {1,0,1}, {2,0,1}})); EXPECT_EQ(thingify(join(tagged(1), tagged(2), tagged(3)), mp), thingify(all(), mp)); + EXPECT_EQ(thingify(reg0_, mp), (mcable_list{{1,0,0.5}})); + EXPECT_EQ(thingify(reg1_, mp), (mcable_list{{0,0.5,1}, {1,0,0.8}, {2,0,0.8}})); + EXPECT_EQ(thingify(reg2_, mp), (mcable_list{{1,0.5,1}})); + EXPECT_EQ(thingify(reg3_, mp), (mcable_list{{0, 0.75, 1}, {1,0,1}})); + EXPECT_EQ(thingify(reg4_, mp), (mcable_list{{1,1,1}})); + EXPECT_EQ(thingify(reg5_, mp), (mcable_list{{0,1,1}})); + EXPECT_EQ(thingify(reg6_, mp), (mcable_list{{0,1,1}})); } +} + +TEST(region, thingify_moderate_morphologies) { + using pvec = std::vector<msize_t>; + using svec = std::vector<msample>; + using cl = mcable_list; // Test multi-level morphologies. 
// @@ -382,21 +523,28 @@ TEST(region, thingify) { { pvec parents = {mnpos, 0, 1, 0, 3, 4, 4, 6}; svec samples = { - {{ 0, 0, 0, 2}, 1}, - {{ 10, 0, 0, 2}, 3}, - {{100, 0, 0, 2}, 3}, - {{ 0, 10, 0, 2}, 2}, - {{ 0,100, 0, 2}, 2}, + {{ 0, 0, 0, 1}, 1}, + {{ 10, 0, 0, 1}, 3}, + {{100, 0, 0, 3}, 3}, + {{ 0, 10, 0, 1}, 2}, + {{ 0,100, 0, 5}, 2}, {{100,100, 0, 2}, 4}, - {{ 0,200, 0, 2}, 3}, - {{ 0,300, 0, 2}, 3}, + {{ 0,200, 0, 1}, 3}, + {{ 0,300, 0, 3}, 3}, }; sample_tree sm(samples, parents); // Without spherical root mprovider mp(morphology(sm, false)); + using ls::location; using reg::tagged; + using reg::distal_interval; + using reg::proximal_interval; + using reg::radius_lt; + using reg::radius_le; + using reg::radius_gt; + using reg::radius_ge; using reg::branch; using reg::all; using reg::cable; @@ -419,11 +567,10 @@ TEST(region, thingify) { mcable b3_{3,0,1}; cl all_ = {b0_,b1_,b2_,b3_}; - mcable end1_{1,1,1}; - mcable root_{0,0,0}; + mcable c_end1_{1,1,1}; + mcable c_root_{0,0,0}; EXPECT_EQ(thingify(all(), mp), all_); - EXPECT_EQ(thingify(soma, mp), empty_); EXPECT_EQ(thingify(axon, mp), (cl{b1_})); EXPECT_EQ(thingify(dend, mp), (cl{b0_,b3_})); EXPECT_EQ(thingify(apic, mp), (cl{b2_})); @@ -432,9 +579,60 @@ TEST(region, thingify) { // Test that intersection correctly generates zero-length cables at // parent-child interfaces. 
- EXPECT_EQ(thingify(intersect(apic, dend), mp), (cl{end1_})); - EXPECT_EQ(thingify(intersect(apic, axon), mp), (cl{end1_})); - EXPECT_EQ(thingify(intersect(axon, dend), mp), (cl{root_, end1_})); + EXPECT_EQ(thingify(intersect(apic, dend), mp), (cl{c_end1_})); + EXPECT_EQ(thingify(intersect(apic, axon), mp), (cl{c_end1_})); + EXPECT_EQ(thingify(intersect(axon, dend), mp), (cl{c_root_, c_end1_})); + + // Test distal and proximal interavls + auto start0_ = location(0, 0 ); + auto quar_1_ = location(1, 0.25); + auto mid1_ = location(1, 0.5 ); + auto end1_ = location(1, 1 ); + auto mid2_ = location(2, 0.5 ); + auto end2_ = location(2, 1 ); + auto mid3_ = location(3, 0.5 ); + auto loc_3_0_ = location(3, 0.4 ); + auto loc_3_1_ = location(3, 0.65); + auto mid_3_ = location(3, 0.5 ); + auto reg_a_ = join(cable(0,0.1,0.4), cable(2,0,1), cable(3,0.1,0.4)); + auto reg_b_ = join(cable(0,0.1,0.4), cable(2,0,1), cable(3,0.1,0.3)); + auto reg_c_ = join(cable(0,0,0.7), cable(2,0,0.5), cable(3,0.1,0.4), cable(3,0.9,1)); + auto reg_d_ = join(cable(0,0,0.7), cable(2,0,0.5), cable(3,0.1,0.9)); + + // Distal from point and/or interval + EXPECT_TRUE(cablelist_eq(thingify(distal_interval(start0_, 1000), mp), (mcable_list{{0,0,1}}))); + EXPECT_TRUE(cablelist_eq(thingify(distal_interval(quar_1_, 150), mp), (mcable_list{{1,0.25,1}, {2,0,0.75}, {3,0,0.375}}))); + EXPECT_TRUE(cablelist_eq(thingify(distal_interval(mid1_, 1000), mp), (mcable_list{{1,0.5,1}, {2,0,1}, {3,0,1}}))); + EXPECT_TRUE(cablelist_eq(thingify(distal_interval(mid1_, 150), mp), (mcable_list{{1,0.5,1}, {2,0,1}, {3,0,0.5}}))); + EXPECT_TRUE(cablelist_eq(thingify(distal_interval(end1_, 100), mp), (mcable_list{{2,0,1},{3,0,0.5}}))); + EXPECT_TRUE(cablelist_eq(thingify(distal_interval(join(quar_1_, mid1_), 150), mp), (mcable_list{{1,0.25,1}, {2,0,1}, {3,0,0.5}}))); + EXPECT_TRUE(cablelist_eq(thingify(distal_interval(join(quar_1_, loc_3_1_), 150), mp), (mcable_list{{1,0.25,1}, {2,0,0.75}, {3,0,0.375}, {3,0.65,1}}))); + 
EXPECT_TRUE(cablelist_eq(thingify(distal_interval(join(quar_1_, loc_3_1_), 150), mp), (mcable_list{{1,0.25,1}, {2,0,0.75}, {3,0,0.375}, {3,0.65,1}}))); + + // Proximal from point and/or interval + EXPECT_TRUE(cablelist_eq(thingify(proximal_interval(mid3_, 100), mp), (mcable_list{{3,0,0.5}}))); + EXPECT_TRUE(cablelist_eq(thingify(proximal_interval(mid3_, 150), mp), (mcable_list{{1,0.5,1}, {3,0,0.5}}))); + EXPECT_TRUE(cablelist_eq(thingify(proximal_interval(end2_, 150), mp), (mcable_list{{1,0.5,1}, {2,0,1}}))); + EXPECT_TRUE(cablelist_eq(thingify(proximal_interval(end2_, 500), mp), (mcable_list{{1,0,1}, {2,0,1}}))); + EXPECT_TRUE(cablelist_eq(thingify(proximal_interval(loc_3_0_, 100), mp), (mcable_list{{1,0.8,1}, {3,0,0.4}}))); + EXPECT_TRUE(cablelist_eq(thingify(proximal_interval(join(loc_3_0_, mid2_), 120), mp), (mcable_list{{1,0.3,1}, {2,0,0.5}, {3, 0, 0.4}}))); + + // Test radius_lt and radius_gt + EXPECT_TRUE(cablelist_eq(thingify(radius_lt(all(), 2), mp), (mcable_list{{0,0,0.55}, {1,0,0.325}, {3,0.375,0.75}}))); + EXPECT_TRUE(cablelist_eq(thingify(radius_lt(all(), 3), mp), (mcable_list{{0,0,1}, {1,0,0.55}, {2,6.0/9.0,1}, {3,0.25,1}}))); + EXPECT_TRUE(cablelist_eq(thingify(radius_gt(all(), 2), mp), (mcable_list{{0,0.55,1}, {1,0.325,1}, {2,0,1}, {3,0,0.375}, {3,0.75,1}}))); + EXPECT_TRUE(cablelist_eq(thingify(radius_gt(all(), 3), mp), (mcable_list{{1,0.55,1}, {2,0,6.0/9.0}, {3,0,0.25}}))); + + EXPECT_TRUE(cablelist_eq(thingify(radius_le(all(), 2), mp), (mcable_list{{0,0,0.55}, {1,0,0.325}, {2,1,1}, {3,0.375,0.75}}))); + EXPECT_TRUE(cablelist_eq(thingify(radius_le(all(), 3), mp), (mcable_list{{0,0,1}, {1,0,0.55}, {2,6.0/9.0,1}, {3,0.25,1}}))); + EXPECT_TRUE(cablelist_eq(thingify(radius_ge(all(), 2), mp), (mcable_list{{0,0.55,1}, {1,0.325,1}, {2,0,1}, {3,0,0.375}, {3,0.75,1}}))); + EXPECT_TRUE(cablelist_eq(thingify(radius_ge(all(), 3), mp), (mcable_list{{1,0.55,1}, {2,0,6.0/9.0}, {3,0,0.25}}))); + + EXPECT_TRUE(cablelist_eq(thingify(radius_lt(reg_a_, 2), mp), 
(mcable_list{{0,0.1,0.4},{3,0.375,0.4}}))); + EXPECT_TRUE(cablelist_eq(thingify(radius_gt(reg_a_, 2), mp), (mcable_list{{2,0,1},{3,0.1,0.375}}))); + EXPECT_TRUE(cablelist_eq(thingify(radius_lt(reg_b_, 2), mp), (mcable_list{{0,0.1,0.4}}))); + EXPECT_TRUE(cablelist_eq(thingify(radius_gt(reg_c_, 2), mp), (mcable_list{{0,0.55,0.7},{2,0,0.5},{3,0.1,0.375},{3,0.9,1}}))); + EXPECT_TRUE(cablelist_eq(thingify(radius_gt(reg_d_, 2), mp), (mcable_list{{0,0.55,0.7},{2,0,0.5},{3,0.1,0.375},{3,0.75,0.9}}))); // Test some more interesting intersections and unions. @@ -570,6 +768,11 @@ TEST(region, thingify) { EXPECT_EQ(thingify(join(lhs, rhs), mp), ror); } +} +TEST(region, thingify_complex_morphologies) { + using pvec = std::vector<msize_t>; + using svec = std::vector<msample>; + using cl = mcable_list; { pvec parents = {mnpos, 0, 1, 0, 3, 4, 5, 5, 7, 7, 4, 10}; svec samples = { @@ -588,8 +791,6 @@ TEST(region, thingify) { }; sample_tree sm(samples, parents); auto m = morphology(sm, false); - std::cout << m.branch_parent(7); - { auto in = cl{{0,0,0},{1,0,0.5},{1,1,1},{2,0,1},{2,1,1},{3,1,1},{4,0,1},{5,1,1},{7,0,1}}; auto out = reg::remove_covered_points(in, m); @@ -604,5 +805,104 @@ TEST(region, thingify) { auto expected = cl{{1,0,0.5},{3,1,1},{4,0,1},{5,1,1},{7,0,1}}; EXPECT_TRUE(cablelist_eq(out, expected)); } + { + mprovider mp(m); + using reg::cable; + using ls::most_distal; + using ls::most_proximal; + + auto reg_a_ = join(cable(0,0.1,0.4), cable(0,0,0.9), cable(1,0.1,0.4)); + auto reg_b_ = join(cable(0,0.1,0.4), cable(0,0,0.9), cable(1,0.1,0.4), cable(1,0.2,0.5)); + auto reg_c_ = join(cable(0,0.1,0.4), cable(0,0,0.9), cable(1,0.1,0.4), cable(2,0.2,0.5)); + auto reg_d_ = join(cable(2,0,0.9), cable(3,0.1,0.1), cable(4,0.1,0.6)); + auto reg_e_ = join(cable(2,0,0.9), cable(4,0.1,0.1), cable(5,0.1,0.6)); + auto reg_f_ = join(cable(7,0,1), cable(2,0,0.9), cable(4,0.1,0.1), cable(5,0.1,0.6)); + + EXPECT_TRUE(mloctionlist_eq(thingify(most_distal(reg_a_), mp), 
mlocation_list{{0,0.9},{1,0.4}})); + EXPECT_TRUE(mloctionlist_eq(thingify(most_distal(reg_b_), mp), mlocation_list{{0,0.9},{1,0.5}})); + EXPECT_TRUE(mloctionlist_eq(thingify(most_distal(reg_c_), mp), mlocation_list{{0,0.9},{2,0.5}})); + EXPECT_TRUE(mloctionlist_eq(thingify(most_distal(reg_d_), mp), mlocation_list{{3,0.1},{4,0.6}})); + EXPECT_TRUE(mloctionlist_eq(thingify(most_distal(reg_e_), mp), mlocation_list{{5,0.6}})); + EXPECT_TRUE(mloctionlist_eq(thingify(most_distal(reg_f_), mp), mlocation_list{{5,0.6},{7,1}})); + + EXPECT_TRUE(mloctionlist_eq(thingify(most_proximal(reg_a_), mp), mlocation_list{{0,0}})); + EXPECT_TRUE(mloctionlist_eq(thingify(most_proximal(reg_b_), mp), mlocation_list{{0,0}})); + EXPECT_TRUE(mloctionlist_eq(thingify(most_proximal(reg_c_), mp), mlocation_list{{0,0}})); + EXPECT_TRUE(mloctionlist_eq(thingify(most_proximal(reg_d_), mp), mlocation_list{{2,0}})); + EXPECT_TRUE(mloctionlist_eq(thingify(most_proximal(reg_e_), mp), mlocation_list{{2,0}})); + EXPECT_TRUE(mloctionlist_eq(thingify(most_proximal(reg_f_), mp), mlocation_list{{2,0}})); + } + } + { + pvec parents = {mnpos, 0, 1, 1, 2, 3, 0, 6, 7, 8, 7}; + svec samples = { + {{ 0, 10, 10, 1}, 1}, + {{ 0, 30, 30, 1}, 2}, + {{ 0, 60,-20, 1}, 2}, + {{ 0, 90, 70, 1}, 2}, + {{ 0, 80,-10, 1}, 2}, + {{ 0,100,-40, 1}, 2}, + {{ 0,-50,-50, 1}, 2}, + {{ 0, 20,-30, 2}, 2}, + {{ 0, 40,-80, 2}, 2}, + {{ 0,-30,-80, 3}, 2}, + {{ 0, 90,-70, 5}, 2} + }; + sample_tree sm(samples, parents); + + // Without spherical root + mprovider mp(morphology(sm, false)); + + using reg::all; + using reg::z_dist_from_soma_lt; + using reg::z_dist_from_soma_le; + using reg::z_dist_from_soma_gt; + using reg::z_dist_from_soma_ge; + using reg::cable; + + // Test projection + EXPECT_TRUE(cablelist_eq(thingify(z_dist_from_soma_lt(0), mp), (mcable_list{}))); + EXPECT_TRUE(cablelist_eq(thingify(z_dist_from_soma_ge(0), mp), thingify(all(), mp))); + + EXPECT_TRUE(cablelist_eq(thingify(z_dist_from_soma_le(100), mp), thingify(all(), 
mp))); + EXPECT_TRUE(cablelist_eq(thingify(z_dist_from_soma_gt(100), mp), (mcable_list{}))); + + EXPECT_TRUE(cablelist_eq(thingify(z_dist_from_soma_le(90), mp), thingify(all(), mp))); + EXPECT_TRUE(cablelist_eq(thingify(z_dist_from_soma_gt(90), mp), (mcable_list{}))); + + EXPECT_TRUE(cablelist_eq(thingify(z_dist_from_soma_lt(20), mp), + (mcable_list{{0,0,1}, + {1,0,0.578250901781922829}, + {2,0.61499300915417734997,0.8349970039232188642}, + {3,0,0.179407353580315756} + }))); + EXPECT_TRUE(cablelist_eq(thingify(z_dist_from_soma_ge(20), mp), + (mcable_list{{0,1,1}, + {1,0.578250901781922829,1}, + {2,0,0.61499300915417734997}, + {2,0.8349970039232188642,1}, + {3,0.179407353580315756,1}, + {4,0,1}, + {5,0,1} + }))); + EXPECT_TRUE(cablelist_eq(thingify(join(z_dist_from_soma_lt(20), z_dist_from_soma_ge(20)), mp), thingify(all(), mp))); + + EXPECT_TRUE(cablelist_eq(thingify(z_dist_from_soma_le(50), mp), + (mcable_list{{0,0,1}, + {1,0,1}, + {2,0,0.2962417607888518767}, + {2,0.4499900130773962142,1}, + {3,0,0.4485183839507893905}, + {3,0.7691110303704736343,1}, + {4,0,0.0869615364994152821}, + {5,0,0.25} + }))); + EXPECT_TRUE(cablelist_eq(thingify(z_dist_from_soma_gt(50), mp), + (mcable_list{{2,0.2962417607888518767,0.4499900130773962142}, + {3,0.4485183839507893905,0.7691110303704736343}, + {4,0.0869615364994152821,1}, + {5,0.25,1}}))); + + EXPECT_TRUE(cablelist_eq(thingify(join(z_dist_from_soma_le(50), z_dist_from_soma_gt(50)), mp), thingify(all(), mp))); } }