From 9df68703e69f5b302a4eb3cf656943bf2f71711d Mon Sep 17 00:00:00 2001 From: Sam Yates <yates@cscs.ch> Date: Wed, 15 Mar 2017 12:27:13 +0100 Subject: [PATCH] Simple `net_receive` device kenels (#193) Fixes #183 Use a device kernel for net_receive state updates. Note: very naive, but gives about a 30% speed up on the 1000 cell miniapp test. All the fun optimization will end up under issue #184. This also incorporates PR #192, so this PR will be amended if that one is rejected. --- modcc/cudaprinter.cpp | 72 +++++++++++++++++++------------ modcc/cudaprinter.hpp | 2 +- src/memory/device_coordinator.hpp | 4 +- 3 files changed, 49 insertions(+), 29 deletions(-) diff --git a/modcc/cudaprinter.cpp b/modcc/cudaprinter.cpp index d151c2d0..904bf7b3 100644 --- a/modcc/cudaprinter.cpp +++ b/modcc/cudaprinter.cpp @@ -1,6 +1,5 @@ #include <algorithm> -#include "cprinter.hpp" // needed for printing net_receive method #include "cudaprinter.hpp" #include "lexer.hpp" #include "options.hpp" @@ -121,7 +120,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o) for(auto const &var : m.symbols()) { if (var.second->kind()==symbolKind::procedure && is_in(var.second->is_procedure()->kind(), - {procedureKind::normal, procedureKind::api})) + {procedureKind::normal, procedureKind::api, procedureKind::net_receive})) { var.second->accept(this); } @@ -434,22 +433,15 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o) else if( var.second->kind()==symbolKind::procedure && var.second->is_procedure()->kind()==procedureKind::net_receive) { + // Simplest net_receive implementation forwards a single update + // to a GPU kernel. auto proc = var.second->is_procedure(); auto name = proc->name(); text_.add_line("void " + name + "(int i_, value_type weight) {"); text_.increase_indentation(); - - // Print the body of the net_receive block. - // Use the same body as would be generated with the cprinter. - // This is not omptimal, because each read and write will require - // a copy between host and device memory, so we will need a - // GPU-specific implementation - auto cprinter = CPrinter(*module_); - cprinter.clear_text(); - cprinter.set_gutter(text_.get_gutter()); - proc->body()->accept(&cprinter); - text_ << cprinter.text(); - + text_.add_line( + "kernels::" + name + "<value_type, size_type>" + + "<<<1, 1>>>(param_pack_, i_, weight);"); text_.decrease_indentation(); text_.add_line("}"); text_.add_line(); @@ -669,22 +661,48 @@ void CUDAPrinter::visit(ProcedureExpression *e) { e->location()); } - // print prototype - print_procedure_prototype(e); - text_.end_line(" {"); + if(e->kind() != procedureKind::net_receive) { + // print prototype + print_procedure_prototype(e); + text_.end_line(" {"); - // print body - increase_indentation(); + // print body + increase_indentation(); - text_.add_line("using value_type = T;"); - text_.add_line(); + text_.add_line("using value_type = T;"); + text_.add_line(); - e->body()->accept(this); + e->body()->accept(this); - // close up - decrease_indentation(); - text_.add_line("}"); - text_.add_line(); + // close up + decrease_indentation(); + text_.add_line("}"); + text_.add_line(); + } + else { + // net_receive() kernel is a special case, not covered by APIMethod visit. + text_.add_gutter() << "template <typename T, typename I>\n"; + text_.add_line( "__global__"); + text_.add_gutter() << "void " << e->name() + << "(" << module_->name() << "_ParamPack<T,I> params_, " + << "I i_, T weight) {"; + text_.add_line(); + increase_indentation(); + + text_.add_line("using value_type = T;"); + text_.add_line("using iarray = I;"); + text_.add_line(); + + text_.add_line("if (threadIdx.x || blockIdx.x) return;"); + text_.add_line("auto tid_ = i_;"); + text_.add_line("auto gid_ __attribute__((unused)) = params_.ni[tid_];"); + + print_APIMethod_body(e); + + decrease_indentation(); + text_.add_line("}"); + text_.add_line(); + } } void CUDAPrinter::visit(APIMethod *e) { @@ -725,7 +743,7 @@ void CUDAPrinter::visit(APIMethod *e) { text_.add_line(); } -void CUDAPrinter::print_APIMethod_body(APIMethod* e) { +void CUDAPrinter::print_APIMethod_body(ProcedureExpression* e) { // load indexes of ion channels auto uses_k = false; auto uses_na = false; diff --git a/modcc/cudaprinter.hpp b/modcc/cudaprinter.hpp index 0b2cb004..0f4acbbb 100644 --- a/modcc/cudaprinter.hpp +++ b/modcc/cudaprinter.hpp @@ -97,7 +97,7 @@ private: return module_->kind() == moduleKind::point; } - void print_APIMethod_body(APIMethod* e); + void print_APIMethod_body(ProcedureExpression* e); void print_procedure_prototype(ProcedureExpression *e); std::string index_string(Symbol *e); diff --git a/src/memory/device_coordinator.hpp b/src/memory/device_coordinator.hpp index 92ede76b..05fa27b9 100644 --- a/src/memory/device_coordinator.hpp +++ b/src/memory/device_coordinator.hpp @@ -290,7 +290,9 @@ public: // fill memory void set(view_type &rng, value_type value) { - gpu::fill<value_type>(rng.data(), value, rng.size()); + if (rng.size()) { + gpu::fill<value_type>(rng.data(), value, rng.size()); + } } // generate reference objects for a raw pointer. -- GitLab