37 #include "impl/Kokkos_Timer.hpp" 
   40 #include <sys/types.h> 
   47 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
   69                 const ViewTypeB& b_arg,
 
   70                 const ViewTypeC& c_arg) :
 
   71     A(A_arg), 
b(b_arg), 
c(c_arg), 
n(
A.extent(1))
 
   88 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
  111                      const ViewTypeB& b_arg,
 
  112                      const ViewTypeC& c_arg) :
 
  113     A(A_arg), 
b(b_arg), 
c(c_arg), 
n(
A.extent(1)), 
p(
A.extent(2)-1)
 
  123       c(i,p) += 
A(i,j,p)*
b(j,p);
 
  125         c(i,k) += 
A(i,j,k)*
b(j,p) + 
A(i,j,p)*
b(j,k);
 
  134 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC,
 
  158                        const ViewTypeB& b_arg,
 
  159                        const ViewTypeC& c_arg) :
 
  160     A(A_arg), 
b(b_arg), 
c(c_arg), 
n(
A.extent(1)), 
p(
A.extent(2)-1)
 
  176         t[k] += 
A(i,j,k)*bv + av*
b(j,k);
 
  189 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC,
 
  212                       const ViewTypeB& b_arg,
 
  213                       const ViewTypeC& c_arg) :
 
  214     A(A_arg), 
b(b_arg), 
c(c_arg), 
n(
A.extent(1))
 
  233 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__) 
  237         t[k] += 
A(i,j,k)*bv + av*
b(j,k);
 
  249 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
  254   Kokkos::parallel_for( A.extent(0), f );
 
  258 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
  263   Kokkos::parallel_for( A.extent(0), f );
 
  267 template <
int MaxP, 
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
  272   Kokkos::parallel_for( A.extent(0), f );
 
  276 template <
int p, 
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
  281   Kokkos::parallel_for( A.extent(0), f );
 
  284 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
  286 check_val(
const ViewTypeA& 
A, 
const ViewTypeB& b, 
const ViewTypeC& 
c)
 
  288   const double tol = 1.0e-14;
 
  289   typedef typename ViewTypeC::value_type value_type;
 
  290   typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
 
  291   Kokkos::deep_copy(h_c, c);
 
  292   const size_t m = A.extent(0);
 
  293   const size_t n = A.extent(1);
 
  294   for (
size_t i=0; i<m; ++i) {
 
  297       std::cout << 
"Comparison failed!  " << i << 
" : " << h_c(i) << 
" , " << t
 
  303 template <
typename ViewTypeA, 
typename ViewTypeB, 
typename ViewTypeC>
 
  307   const double tol = 1.0e-14;
 
  308   typedef typename ViewTypeC::value_type value_type;
 
  309   typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
 
  310   Kokkos::deep_copy(h_c, c);
 
  311   const size_t m = A.extent(0);
 
  312   const size_t n = A.extent(1);
 
  313   const size_t p = A.extent(2);
 
  314   for (
size_t i=0; i<m; ++i) {
 
  315     for (
size_t j=0; j<p; ++j) {
 
  316       value_type t = (j == p-1 ? n : 2*n);
 
  318         std::cout << 
"Comparison failed!  " << i << 
"," << j << 
" : " 
  319                   << h_c(i,j) << 
" , " << t << std::endl;
 
  331 template <
typename FadType, 
typename ... ViewArgs>
 
  333 do_time_fad(
const size_t m, 
const size_t n, 
const size_t p, 
const size_t nloop,
 
  336   typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
 
  337   typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeB;
 
  338   typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeC;
 
  339   typedef typename ViewTypeA::execution_space execution_space;
 
  341   ViewTypeA 
A(
"A",m,n,p+1);
 
  342   ViewTypeB b(
"B",n,p+1);
 
  343   ViewTypeC 
c(
"c",m,p+1);
 
  346   for (
size_t k=0; k<p; ++k)
 
  347     a.fastAccessDx(k) = 1.0;
 
  348   Kokkos::deep_copy(A, a);
 
  349   Kokkos::deep_copy(b, a);
 
  351   Kokkos::Impl::Timer wall_clock;
 
  356   execution_space().fence();
 
  359   for (
size_t l=0; l<nloop; l++) {
 
  362   execution_space().fence();
 
  364   perf.
time = wall_clock.seconds() / nloop;
 
  365   perf.
flops = m*n*(2+4*p);
 
  369     typename ViewTypeA::array_type A_flat = 
A;
 
  370     typename ViewTypeB::array_type b_flat = b;
 
  371     typename ViewTypeC::array_type c_flat = 
c;
 
  378 template <
typename ... ViewArgs>
 
  381                  const size_t nloop, 
const bool check)
 
  383   typedef Kokkos::View<
double***, ViewArgs...> ViewTypeA;
 
  384   typedef Kokkos::View<
double**,  ViewArgs...> ViewTypeB;
 
  385   typedef Kokkos::View<
double**,  ViewArgs...> ViewTypeC;
 
  386   typedef typename ViewTypeA::execution_space execution_space;
 
  388   ViewTypeA 
A(
"A",m,n,p+1);
 
  389   ViewTypeB b(
"B",n,p+1);
 
  390   ViewTypeC 
c(
"c",m,p+1);
 
  392   Kokkos::deep_copy(A, 1.0);
 
  393   Kokkos::deep_copy(b, 1.0);
 
  395   Kokkos::Impl::Timer wall_clock;
 
  400   execution_space().fence();
 
  404   for (
size_t l=0; l<nloop; l++) {
 
  407   execution_space().fence();
 
  410   perf.
time = wall_clock.seconds() / nloop;
 
  411   perf.
flops = m*n*(2+4*p);
 
  420 template <
int MaxP, 
typename ... ViewArgs>
 
  423                     const size_t nloop, 
const bool check)
 
  425   typedef Kokkos::View<
double***, ViewArgs...> ViewTypeA;
 
  426   typedef Kokkos::View<
double**,  ViewArgs...> ViewTypeB;
 
  427   typedef Kokkos::View<
double**,  ViewArgs...> ViewTypeC;
 
  428   typedef typename ViewTypeA::execution_space execution_space;
 
  430   ViewTypeA 
A(
"A",m,n,p+1);
 
  431   ViewTypeB b(
"B",n,p+1);
 
  432   ViewTypeC 
c(
"c",m,p+1);
 
  434   Kokkos::deep_copy(A, 1.0);
 
  435   Kokkos::deep_copy(b, 1.0);
 
  437   Kokkos::Impl::Timer wall_clock;
 
  441   run_mat_vec_deriv_sl<MaxP>( 
A, b, 
c );
 
  442   execution_space().fence();
 
  446   for (
size_t l=0; l<nloop; l++) {
 
  447     run_mat_vec_deriv_sl<MaxP>( 
A, b, 
c );
 
  449   execution_space().fence();
 
  452   perf.
time = wall_clock.seconds() / nloop;
 
  453   perf.
flops = m*n*(2+4*p);
 
  462 template <
int p, 
typename ... ViewArgs>
 
  465                    const size_t nloop, 
const bool check)
 
  467   typedef Kokkos::View<
double**[p+1], ViewArgs...> ViewTypeA;
 
  468   typedef Kokkos::View<
double**,  ViewArgs...> ViewTypeB;
 
  469   typedef Kokkos::View<
double**,  ViewArgs...> ViewTypeC;
 
  470   typedef typename ViewTypeA::execution_space execution_space;
 
  472   ViewTypeA 
A(
"A",m,n,p+1);
 
  473   ViewTypeB b(
"B",n,p+1);
 
  474   ViewTypeC 
c(
"c",m,p+1);
 
  476   Kokkos::deep_copy(A, 1.0);
 
  477   Kokkos::deep_copy(b, 1.0);
 
  479   Kokkos::Impl::Timer wall_clock;
 
  483   run_mat_vec_deriv_s<p>( 
A, b, 
c );
 
  484   execution_space().fence();
 
  488   for (
size_t l=0; l<nloop; l++) {
 
  489     run_mat_vec_deriv_s<p>( 
A, b, 
c );
 
  491   execution_space().fence();
 
  494   perf.
time = wall_clock.seconds() / nloop;
 
  495   perf.
flops = m*n*(2+4*p);
 
  504 template <
typename ... ViewArgs>
 
  509   typedef Kokkos::View<
double**, ViewArgs...> ViewTypeA;
 
  510   typedef Kokkos::View<
double*,  ViewArgs...> ViewTypeB;
 
  511   typedef Kokkos::View<
double*,  ViewArgs...> ViewTypeC;
 
  512   typedef typename ViewTypeA::execution_space execution_space;
 
  514   ViewTypeA 
A(
"A",m,n);
 
  518   Kokkos::deep_copy(A, 1.0);
 
  519   Kokkos::deep_copy(b, 1.0);
 
  521   Kokkos::Impl::Timer wall_clock;
 
  526   execution_space().fence();
 
  529   for (
size_t l=0; l<nloop; l++) {
 
  532   execution_space().fence();
 
  534   perf.
time = wall_clock.seconds() / nloop;
 
  547   std::cout << name << 
"\t " 
  548             << perf.
time << 
"\t " 
  568   perf_analytic.
time = 1.0;
 
  582     print_perf(perf_analytic, perf_analytic, 
"Analytic  ");
 
  588     print_perf(perf, perf_analytic, 
"Analytic-s");
 
  594     print_perf(perf, perf_analytic, 
"Analytic-sl");
 
  600       do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
 
  607       do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
 
  614       do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
 
  630 template <
int SFadSize, 
int SLFadSize, 
typename Device>
 
  643                 const std::string& device)
 
  646   std::cout.setf(std::ios::scientific);
 
  647   std::cout.precision(prec);
 
  648   std::cout << std::endl
 
  650             << 
" performance for layout " 
  652             << 
" m = " << m << 
" n = " << n << 
" p = " << p
 
  653             << std::endl << std::endl;
 
  654   std::cout << 
"Computation \t Time     \t Throughput \t Ratio" << std::endl;
 
  657     do_times<SFadSize,SLFadSize,Kokkos::LayoutLeft,Device>(
 
  658       m,n,p,nloop,value,analytic,sfad,slfad,dfad,check);
 
  660     do_times<SFadSize,SLFadSize,Kokkos::LayoutRight,Device>(
 
  661       m,n,p,nloop,value,analytic,sfad,slfad,dfad,check);
 
  663     do_times<SFadSize,SLFadSize,Device>
 
  664       (m,n,p,nloop,value,analytic,sfad,slfad,dfad,check);
 
  669   std::stringstream cmd;
 
  670   pid_t my_os_pid=getpid();
 
  671   const std::string vtune_loc =
 
  673   const std::string output_dir = 
"./vtune";
 
  675       << 
" -collect hotspots -result-dir " << output_dir
 
  676       << 
" -target-pid " << my_os_pid << 
" &";
 
  677   std::cout << cmd.str() << std::endl;
 
  678   system(cmd.str().c_str());
 
  685 int main(
int argc, 
char* argv[]) {
 
  691     clp.
setDocString(
"This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
 
  693     clp.
setOption(
"m", &m, 
"Number of matrix rows");
 
  695     clp.
setOption(
"n", &n, 
"Number of matrix columns");
 
  697     clp.
setOption(
"p", &p, 
"Number of derivative components");
 
  699     clp.
setOption(
"nloop", &nloop, 
"Number of loops");
 
  700 #ifdef KOKKOS_ENABLE_SERIAL 
  702     clp.
setOption(
"serial", 
"no-serial", &serial, 
"Whether to run Serial");
 
  704 #ifdef KOKKOS_ENABLE_OPENMP 
  706     clp.
setOption(
"openmp", &openmp, 
"Number of OpenMP threads");
 
  708 #ifdef KOKKOS_ENABLE_THREADS 
  710     clp.
setOption(
"threads", &threads, 
"Number of pThreads threads");
 
  712 #ifdef KOKKOS_ENABLE_CUDA 
  714     clp.
setOption(
"cuda", 
"no-cuda", &cuda, 
"Whether to run CUDA");
 
  718                   "Number of NUMA domains to use (set to 0 to use all NUMAs");
 
  719     int cores_per_numa = 0;
 
  720     clp.
setOption(
"cores-per-numa", &cores_per_numa,
 
  721                   "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
 
  722     bool print_config = 
false;
 
  723     clp.
setOption(
"print-config", 
"no-print-config", &print_config,
 
  724                   "Whether to print Kokkos device configuration");
 
  729     clp.
setOption(
"vtune", 
"no-vtune", &vtune, 
"Profile with vtune");
 
  731     clp.
setOption(
"value", 
"no-value", &value, 
"Run value calculation");
 
  732     bool analytic = 
true;
 
  733     clp.
setOption(
"analytic", 
"no-analytic", &analytic,
 
  734                   "Run analytic derivative calculation");
 
  736     clp.
setOption(
"sfad", 
"no-sfad", &sfad, 
"Run SFad derivative calculation");
 
  738     clp.
setOption(
"slfad", 
"no-slfad", &slfad, 
"Run SLFad derivative calculation");
 
  739 #if defined(KOKKOS_ENABLE_CUDA_UVM) 
  741     clp.
setOption(
"dfad", 
"no-dfad", &dfad, 
"Run DFad derivative calculation");
 
  746     clp.
setOption(
"check", 
"no-check", &check, 
"Check calculations are correct");
 
  749     switch (clp.
parse(argc, argv)) {
 
  762     Kokkos::InitArguments init_args;
 
  763     init_args.num_threads = cores_per_numa;
 
  764     init_args.num_numa = numa;
 
  766     Kokkos::initialize(init_args);
 
  769       Kokkos::print_configuration(std::cout, 
true);
 
  771 #ifdef KOKKOS_ENABLE_SERIAL 
  773       do_times_layout<SFadSize,SLFadSize,Kokkos::Serial>(
 
  774         m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,
"Serial");
 
  778 #ifdef KOKKOS_ENABLE_OPENMP 
  780       do_times_layout<SFadSize,SLFadSize,Kokkos::OpenMP>(
 
  781         m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,
"OpenMP");
 
  785 #ifdef KOKKOS_ENABLE_THREADS 
  787       do_times_layout<SFadSize,SLFadSize,Kokkos::Threads>(
 
  788         m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,
"Threads");
 
  792 #ifdef KOKKOS_ENABLE_CUDA 
  794       do_times_layout<SFadSize,SLFadSize,Kokkos::Cuda>(
 
  795         m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,
"Cuda");
 
double do_time_analytic(int nderiv, int nloop)
const char * layout_names[]
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void do_times(const T x[], int nloop, Teuchos::Array< double > ×)
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > FadType
KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const 
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t ph, const size_t nloop, const bool value, const bool sfad, const bool slfad, const bool dfad, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
void run_mat_vec_deriv_sl(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const 
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
#define KOKKOS_INLINE_FUNCTION
ViewTypeC::value_type scalar_type
KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const 
void start(bool reset=false)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
MatVecFunctor(const ViewTypeA &A_arg, const ViewTypeB &b_arg, const ViewTypeC &c_arg)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
const LayoutType layout_values[]
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const 
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
ViewTypeC::value_type scalar_type
MatVecDerivFunctor(const ViewTypeA &A_arg, const ViewTypeB &b_arg, const ViewTypeC &c_arg)
SLMatVecDerivFunctor(const ViewTypeA &A_arg, const ViewTypeB &b_arg, const ViewTypeC &c_arg)
SMatVecDerivFunctor(const ViewTypeA &A_arg, const ViewTypeB &b_arg, const ViewTypeC &c_arg)
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
void setDocString(const char doc_string[])
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
ViewTypeC::value_type scalar_type
KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const 
ViewTypeC::value_type scalar_type
ViewTypeC::execution_space execution_space
void run_mat_vec_deriv_s(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
ViewTypeC::execution_space execution_space
ViewTypeC::execution_space execution_space
const int num_layout_types
ViewTypeC::execution_space execution_space
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
void run_mat_vec_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)