3 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1 
    4 #define SACADO_KOKKOS_USE_MEMORY_POOL 1 
    5 #define SACADO_ALIGN_SFAD 1 
   12 #include "Kokkos_Core.hpp" 
   13 #include "Kokkos_MemoryPool.hpp" 
   14 #include "impl/Kokkos_Timer.hpp" 
   20 template<
typename FluxView, 
typename WgbView, 
typename SrcView,
 
   21          typename WbsView, 
typename ResidualView>
 
   24 template<
typename FluxView, 
typename WgbView, 
typename SrcView,
 
   25          typename WbsView, 
typename ResidualView>
 
   29                         const SrcView& src, 
const WbsView& bs,
 
   30                         const ResidualView& residual,
 
   31                         const typename FluxView::non_const_value_type& 
coeff);
 
   33 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) 
   38 template<
typename FluxView, 
typename WgbView, 
typename SrcView,
 
   39          typename WbsView, 
typename ResidualView>
 
   43   typedef typename Kokkos::TeamPolicy<execution_space>::member_type 
team_handle;
 
   60                   const SrcView& src, 
const WbsView& bs,
 
   79   void operator() (
const size_t cell, 
const int basis)
 const {
 
   82       for (
int dim=0; dim<
num_dim; ++dim)
 
   83         value += 
flux_m_i(cell,qp,dim)*
wgb(cell,basis,qp,dim);
 
   91     for (
int basis=0; basis<
num_basis; ++basis) {
 
   96   template <
unsigned VS>
 
   99     const size_t cell = team.league_rank()*team.team_size() + team.team_rank();
 
  104   template <
unsigned VS>
 
  107     const size_t cell = team.league_rank();
 
  108     const int team_size = team.team_size();
 
  109     for (
int basis=team.team_rank(); basis<
num_basis; basis+=team_size)
 
  110       (*
this)(cell, basis);
 
  114 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL) 
  119 template<
typename FluxView, 
typename WgbView, 
typename SrcView,
 
  120          typename WbsView, 
typename ResidualView>
 
  122   typedef typename FluxView::non_const_value_type 
scalar_type;
 
  123   typedef typename Kokkos::ThreadLocalScalarType<FluxView>::type local_scalar_type;
 
  125   typedef typename Kokkos::TeamPolicy<execution_space>::member_type 
team_handle;
 
  126   enum { stride = Kokkos::ViewScalarStride<FluxView>::stride };
 
  138   template <
unsigned VS> 
struct HierarchicalFlatTag {};
 
  139   template <
unsigned VS> 
struct HierarchicalTeamTag {};
 
  143                   const SrcView& src, 
const WbsView& bs,
 
  162   void operator() (
const size_t cell, 
const int basis)
 const {
 
  163     local_scalar_type value(0),value2(0);
 
  164     local_scalar_type 
c = Sacado::partition_scalar<stride>(
coeff);
 
  166       for (
int dim=0; dim<
num_dim; ++dim)
 
  167         value += 
flux_m_i(cell,qp,dim)*
wgb(cell,basis,qp,dim);
 
  168       value2 += 
src_m_i(cell,qp)*
wbs(cell,basis,qp);
 
  175     for (
int basis=0; basis<
num_basis; ++basis) {
 
  180   template <
unsigned VS>
 
  183     const size_t cell = team.league_rank()*team.team_size() + team.team_rank();
 
  188   template <
unsigned VS>
 
  191     const size_t cell = team.league_rank();
 
  192     const int team_size = team.team_size();
 
  193     for (
int basis=team.team_rank(); basis<
num_basis; basis+=team_size)
 
  194       (*
this)(cell, basis);
 
  203 template<
typename FluxView, 
typename WgbView, 
typename SrcView,
 
  204          typename WbsView, 
typename ResidualView>
 
  206   typedef typename FluxView::non_const_value_type 
scalar_type;
 
  208   typedef typename Kokkos::TeamPolicy<execution_space>::member_type 
team_handle;
 
  219   template <
unsigned VS> 
struct HierarchicalFlatTag {};
 
  220   template <
unsigned VS> 
struct HierarchicalTeamTag {};
 
  221   template <
unsigned VS> 
struct PartitionedTag {};
 
  225                   const SrcView& src, 
const WbsView& bs,
 
  226                   const ResidualView& residual, 
const scalar_type& c) :
 
  235     num_points(
wgb.extent(2)),
 
  244   void operator() (
const size_t cell, 
const int basis)
 const {
 
  247       for (
int dim=0; dim<
num_dim; ++dim)
 
  248         value += 
flux_m_i(cell,qp,dim)*
wgb(cell,basis,qp,dim);
 
  249       value2 += 
src_m_i(cell,qp)*
wbs(cell,basis,qp);
 
  256     for (
int basis=0; basis<
num_basis; ++basis) {
 
  261   template <
unsigned VS>
 
  263   void operator() (
const PartitionedTag<VS>, 
const size_t cell, 
const int basis)
 const {
 
  269     const unsigned k = threadIdx.x;
 
  271     const unsigned k = 0;
 
  275     auto flux_part =  Kokkos::partition<VS>(
flux_m_i, k, VS);
 
  276     auto wgb_part =   Kokkos::partition<VS>(
wgb, k, VS);
 
  277     auto src_part =   Kokkos::partition<VS>(
src_m_i, k, VS);
 
  278     auto wbs_part =   Kokkos::partition<VS>(
wbs, k, VS);
 
  279     auto resid_part = Kokkos::partition<VS>(
residual_m_i, k, VS);
 
  280     auto coeff_part = Sacado::partition_scalar<VS>(
coeff);
 
  284                                                wbs_part, resid_part,
 
  286     kernel_part(cell, basis);
 
  289   template <
unsigned VS>
 
  292     const size_t cell = team.league_rank()*team.team_size() + team.team_rank();
 
  294       for (
int basis=0; basis<
num_basis; ++basis)
 
  295         (*
this)(PartitionedTag<VS>(), cell, basis);
 
  298   template <
unsigned VS>
 
  301     const size_t cell = team.league_rank();
 
  302     const int team_size = team.team_size();
 
  303     for (
int basis=team.team_rank(); basis<
num_basis; basis+=team_size)
 
  304       (*
this)(PartitionedTag<VS>(), cell, basis);
 
  310 template<
typename FluxView, 
typename WgbView, 
typename SrcView,
 
  311          typename WbsView, 
typename ResidualView>
 
  315                         const SrcView& src, 
const WbsView& bs,
 
  316                         const ResidualView& residual,
 
  317                         const typename FluxView::non_const_value_type& coeff)
 
  320   return kernel_type(flux,bg,src,bs,residual,coeff);
 
  323 template<
typename KernelType>
 
  325   typedef typename KernelType::execution_space execution_space;
 
  326   Kokkos::RangePolicy<execution_space> policy(0,kernel.num_cells());
 
  327   Kokkos::parallel_for(policy, kernel);
 
  330 template<
typename KernelType>
 
  332   typedef typename KernelType::execution_space execution_space;
 
  333 #if defined (KOKKOS_ENABLE_CUDA) 
  334   const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
 
  336   const bool is_cuda = 
false;
 
  338   const unsigned vector_size = is_cuda ? 32 : 1;
 
  340     const unsigned team_size = 256 / vector_size;
 
  341     typedef typename KernelType::template HierarchicalFlatTag<vector_size> tag_type;
 
  342     typedef Kokkos::TeamPolicy<execution_space,tag_type> policy_type;
 
  343     const size_t range = (kernel.num_cells()+team_size-1)/team_size;
 
  344     policy_type policy(range,team_size,vector_size);
 
  345     Kokkos::parallel_for(policy, kernel);
 
  352 template<
typename KernelType>
 
  354   typedef typename KernelType::execution_space execution_space;
 
  355 #if defined (KOKKOS_ENABLE_CUDA) 
  356   const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
 
  358   const bool is_cuda = 
false;
 
  360   const unsigned vector_size = is_cuda ? 32 : 1;
 
  362     const unsigned team_size = 256 / vector_size;
 
  363     typedef typename KernelType::template HierarchicalTeamTag<vector_size> tag_type;
 
  364     typedef Kokkos::TeamPolicy<execution_space,tag_type> policy_type;
 
  365     policy_type policy(kernel.num_cells(),team_size,vector_size);
 
  366     Kokkos::parallel_for(policy, kernel);
 
  376     return std::string(
"dfad")
 
  377 #if defined(SACADO_KOKKOS_USE_MEMORY_POOL) 
  378       + std::string(
", mempool")
 
  380 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD_STRIDED) 
  381       + std::string(
", strided")
 
  388 #if defined(SACADO_ALIGN_SFAD) 
  389     return "sfad, aligned";
 
  396   static std::string 
eval() { 
return "slfad"; }
 
  399 template<
typename ExecSpace, 
int DIM, 
int N>
 
  411   typedef Kokkos::View<double****[N+1],ExecSpace> 
t_4DView;
 
  412   typedef Kokkos::View<double***[N+1],ExecSpace> 
t_3DView;
 
  413   typedef Kokkos::View<double**[N+1],ExecSpace> 
t_2DView;
 
  415   typedef Kokkos::View<const double***[N+1],ExecSpace,Kokkos::MemoryTraits<Kokkos::RandomAccess> > 
t_3DView_const;
 
  417 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) 
  427 #if defined(KOKKOS_ENABLE_CUDA) 
  429     std::is_same< ExecSpace, Kokkos::Cuda >::value ? 32 : 1;
 
  430 #if defined(SACADO_ALIGN_SFAD) 
  432   typedef typename FadType::template apply_N<Nalign>::type 
AlignedFadType;
 
  445   typedef typename Kokkos::TeamPolicy<ExecSpace>::member_type 
team_handle;
 
  447   typedef Kokkos::View<double****[N+1],Kokkos::LayoutRight,ExecSpace> 
t_4DView_team;
 
  448   typedef Kokkos::View<double***[N+1],Kokkos::LayoutRight,ExecSpace> 
t_3DView_team;
 
  449   typedef Kokkos::View<double**[N+1],Kokkos::LayoutRight,ExecSpace> 
t_2DView_team;
 
  450   typedef Kokkos::View<const double***[N+1],Kokkos::LayoutRight,ExecSpace,Kokkos::MemoryTraits<Kokkos::RandomAccess> > 
t_3DView_const_team;
 
  453   typedef Kokkos::View<double[N+1],typename ExecSpace::scratch_memory_space,Kokkos::MemoryTraits<Kokkos::Unmanaged> > 
t_shared_scalar;
 
  477     num_basis(num_basis_) ,
 
  478     num_points(num_points_) ,
 
  513     for (
int i=0; i<N; ++i)
 
  518   typename FadType::value_type
 
  520                 const size_t n2, 
const size_t n3, 
const int fad_size,
 
  521                 const size_t i0, 
const size_t i1,
 
  522                 const size_t i2, 
const size_t i3,
 
  525     typedef typename FadType::value_type scalar;
 
  526     const scalar x0 =    10.0 + scalar(n0) / scalar(i0+1);
 
  527     const scalar x1 =   100.0 + scalar(n1) / scalar(i1+1);
 
  528     const scalar x2 =  1000.0 + scalar(n2) / scalar(i2+1);
 
  529     const scalar x3 = 10000.0 + scalar(n3) / scalar(i3+1);
 
  530     const scalar x  = x0 + x1 + x2 + x3;
 
  531     if (i_fad == fad_size)
 
  533     const scalar x_fad = 1.0 + scalar(fad_size) / scalar(i_fad+1);
 
  537   template <
typename V1, 
typename V2, 
typename V3, 
typename V4, 
typename V5>
 
  538   void init_fad(
const V1& v1, 
const V2& v2, 
const V3& v3, 
const V4& v4, 
const V5& v5)
 
  545     auto v1_h = Kokkos::create_mirror_view(v1);
 
  546     auto v2_h = Kokkos::create_mirror_view(v2);
 
  547     auto v3_h = Kokkos::create_mirror_view(v3);
 
  548     auto v4_h = Kokkos::create_mirror_view(v4);
 
  549     for (
int cell=0; cell<
ncells; ++cell) {
 
  550       for (
int basis=0; basis<
num_basis; ++basis) {
 
  552           for (
int dim=0; dim<
ndim; ++dim) {
 
  553             for (
int i=0; i<N; ++i)
 
  554               v1_h(cell,basis,qp,dim).fastAccessDx(i) =
 
  555                 generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,i);
 
  556             v1_h(cell,basis,qp,dim).val() =
 
  557               generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N);
 
  559           for (
int i=0; i<N; ++i)
 
  560             v2_h(cell,basis,qp).fastAccessDx(i) =
 
  561               generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,i);
 
  562           v2_h(cell,basis,qp).val() =
 
  563               generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N);
 
  567         for (
int dim=0; dim<
ndim; ++dim) {
 
  568           for (
int i=0; i<N; ++i)
 
  569             v3_h(cell,qp,dim).fastAccessDx(i) =
 
  570               generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,i);
 
  571           v3_h(cell,qp,dim).val() =
 
  572               generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,N);
 
  574         for (
int i=0; i<N; ++i)
 
  575           v4_h(cell,qp).fastAccessDx(i) =
 
  577         v4_h(cell,qp).val() =
 
  582     Kokkos::deep_copy( v1, v1_h );
 
  583     Kokkos::deep_copy( v2, v2_h );
 
  584     Kokkos::deep_copy( v3, v3_h );
 
  585     Kokkos::deep_copy( v4, v4_h );
 
  587     Kokkos::deep_copy(
typename V5::array_type(v5), 0.0);
 
  590   template <
typename V1, 
typename V2, 
typename V3, 
typename V4, 
typename V5>
 
  591   void init_array(
const V1& v1, 
const V2& v2, 
const V3& v3, 
const V4& v4, 
const V5& v5)
 
  598     auto v1_h = Kokkos::create_mirror_view(v1);
 
  599     auto v2_h = Kokkos::create_mirror_view(v2);
 
  600     auto v3_h = Kokkos::create_mirror_view(v3);
 
  601     auto v4_h = Kokkos::create_mirror_view(v4);
 
  602     for (
int cell=0; cell<
ncells; ++cell) {
 
  603       for (
int basis=0; basis<
num_basis; ++basis) {
 
  605           for (
int dim=0; dim<
ndim; ++dim) {
 
  606             for (
int i=0; i<N; ++i)
 
  607               v1_h(cell,basis,qp,dim,i) =
 
  608                 generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,i);
 
  609             v1_h(cell,basis,qp,dim,N) =
 
  610               generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N);
 
  612           for (
int i=0; i<N; ++i)
 
  613             v2_h(cell,basis,qp,i) =
 
  614               generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,i);
 
  615           v2_h(cell,basis,qp,N) =
 
  616             generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N);
 
  620         for (
int dim=0; dim<
ndim; ++dim) {
 
  621           for (
int i=0; i<N; ++i)
 
  622             v3_h(cell,qp,dim,i) =
 
  623               generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,i);
 
  624           v3_h(cell,qp,dim,N) =
 
  625             generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,N);
 
  627         for (
int i=0; i<N; ++i)
 
  635     Kokkos::deep_copy( v1, v1_h );
 
  636     Kokkos::deep_copy( v2, v2_h );
 
  637     Kokkos::deep_copy( v3, v3_h );
 
  638     Kokkos::deep_copy( v4, v4_h );
 
  640     Kokkos::deep_copy(
typename V5::array_type(v5), 0.0);
 
  643   template <
typename View1, 
typename View2>
 
  644   typename std::enable_if< !Kokkos::is_view_fad<View2>::value, 
bool>::type
 
  645   check(
const View1& v_gold, 
const View2& v, 
const double tol)
 
  648     typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold);
 
  649     typename View2::HostMirror v_h      = Kokkos::create_mirror_view(v);
 
  650     Kokkos::deep_copy(v_gold_h, v_gold);
 
  651     Kokkos::deep_copy(v_h, v);
 
  653     typedef typename View1::value_type value_type;
 
  655     const size_t n0 = v_gold_h.extent(0);
 
  656     const size_t n1 = v_gold_h.extent(1);
 
  657     const size_t n2 = v_gold_h.extent(2);
 
  660     for ( 
size_t i0 = 0 ; i0 < n0 ; ++i0 ) {
 
  661       for ( 
size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
 
  662         for ( 
size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
 
  663           value_type x_gold = v_gold_h(i0,i1,i2);
 
  664           value_type x      = v_h(i0,i1,i2);
 
  666             std::cout << 
"Comparison failed!  x_gold(" 
  667                       << i0 << 
"," << i1 << 
"," << i2 << 
") = " 
  668                       << x_gold << 
" , x = " << x
 
  679   template <
typename View1, 
typename View2>
 
  680   typename std::enable_if< Kokkos::is_view_fad<View2>::value, 
bool>::type
 
  681   check(
const View1& v_gold, 
const View2& v, 
const double tol)
 
  684     typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold);
 
  685     typename View2::HostMirror v_h      = Kokkos::create_mirror_view(v);
 
  686     Kokkos::deep_copy(v_gold_h, v_gold);
 
  687     Kokkos::deep_copy(v_h, v);
 
  689     typedef typename View1::value_type value_type;
 
  691     const size_t n0 = v_gold_h.extent(0);
 
  692     const size_t n1 = v_gold_h.extent(1);
 
  693     const size_t n2 = v_gold_h.extent(2);
 
  696     for ( 
size_t i0 = 0 ; i0 < n0 ; ++i0 ) {
 
  697       for ( 
size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
 
  698         for ( 
size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
 
  699           value_type x_gold = v_gold_h(i0,i1,i2);
 
  700           value_type x      = (i2 == n2-1) ? v_h(i0,i1).val() : v_h(i0,i1).dx(i2);
 
  702             std::cout << 
"Comparison failed!  x_gold(" 
  703                       << i0 << 
"," << i1 << 
"," << i2 << 
") = " 
  704                       << x_gold << 
" , x = " << x
 
  717     for (
int basis=0; basis<
num_basis; ++basis) {
 
  718       double value[N+1],value2[N+1];
 
  719       for (
int k=0; k<N+1; ++k) {
 
  724         for (
int dim=0; dim<DIM; ++dim) {
 
  725           const double flux_val = 
flux_m_i(cell,qp,dim,N);
 
  726           const double wgb_val = 
wgb(cell,basis,qp,dim,N);
 
  727           value[N] += flux_val*wgb_val;
 
  728           for(
int k = 0; k<N;k++)
 
  729             value[k] += flux_val*
wgb(cell,basis,qp,dim,k)+
flux_m_i(cell,qp,dim,k)*wgb_val;
 
  731         const double src_val = 
src_m_i(cell,qp,N);
 
  732         const double wbs_val = 
wbs(cell,basis,qp,N);
 
  733         value2[N] += src_val*wbs_val;
 
  734         for(
int k = 0; k<N;k++)
 
  735           value2[k] += src_val*
wbs(cell,basis,qp,k)+
src_m_i(cell,qp,k)*wbs_val;
 
  737       for(
int k = 0; k<N; k++)
 
  739           coeff.val()*(value[k]+value2[k]) +
 
  740           coeff.fastAccessDx(k)*(value[N]+value2[N]);
 
  747     for (
int basis=0; basis<
num_basis; ++basis) {
 
  748       double value[N+1],value2[N+1];
 
  749       for (
int k=0; k<N+1; ++k) {
 
  754         for (
int dim=0; dim<DIM; ++dim) {
 
  755           const double flux_val = 
flux_m_i(cell,qp,dim,N);
 
  756           const double wgb_val = 
wgb(cell,basis,qp,dim,N);
 
  757           value[N] += flux_val*wgb_val;
 
  758           for(
int k = 0; k<N;k++)
 
  759             value[k] += flux_val*
wgb(cell,basis,qp,dim,k)+
flux_m_i_const(cell,qp,dim,k)*wgb_val;
 
  761         const double src_val = 
src_m_i(cell,qp,N);
 
  762         const double wbs_val = 
wbs(cell,basis,qp,N);
 
  763         value2[N] += src_val*wbs_val;
 
  764         for(
int k = 0; k<N;k++)
 
  765           value2[k] += src_val*
wbs(cell,basis,qp,k)+
src_m_i(cell,qp,k)*wbs_val;
 
  767       for(
int k = 0; k<N; k++)
 
  769           coeff.val()*(value[k]+value2[k]) +
 
  770           coeff.fastAccessDx(k)*(value[N]+value2[N]);
 
  780       for (
int dim=0; dim<DIM; ++dim) {
 
  781         const double flux_val = 
tflux_m_i(cell,qp,dim,N);
 
  782         const double wgb_val = 
twgb(cell,basis,qp,dim,N);
 
  783         Kokkos::single(Kokkos::PerThread(team), [&] () {
 
  784             value[N] += flux_val*wgb_val;
 
  786         Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N), [&] (
const int& k) {
 
  790       const double src_val = 
tsrc_m_i(cell,qp,N);
 
  791       const double wbs_val = 
twbs(cell,basis,qp,N);
 
  792       Kokkos::single(Kokkos::PerThread(team), [&] () {
 
  793           value2[N] += src_val*wbs_val;
 
  795       Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N), [&] (
const int& k) {
 
  796           value2[k] += src_val*
twbs(cell,basis,qp,k)+
tsrc_m_i(cell,qp,k)*wbs_val;
 
  799     Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N), [&] (
const int& k) {
 
  801           coeff.val()*(value[k]+value2[k]) +
 
  802           coeff.fastAccessDx(k)*(value[N]+value2[N]);
 
  804     Kokkos::single(Kokkos::PerThread(team), [&] () {
 
  814     const int cell = team.league_rank();
 
  815     Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_basis), [&] (
const int& basis) {
 
  816         Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N+1), [&] (
const int& k) {
 
  824   void compute(
const int ntrial, 
const bool do_check) {
 
  827     Kokkos::Impl::Timer timer;
 
  828     for (
int i=0; i<ntrial; ++i) {
 
  832     double time_fad = timer.seconds() / ntrial / 
ncells;
 
  837     for (
int i=0; i<ntrial; ++i) {
 
  841     double time_fad_cont = timer.seconds() / ntrial / 
ncells;
 
  844     for (
int i=0; i<ntrial; ++i)
 
  845       Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace,MomFluxTag>(0,
ncells), *
this);
 
  847     double time = timer.seconds() / ntrial / 
ncells;
 
  850     for (
int i=0; i<ntrial; ++i)
 
  851       Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace,MomFluxTagConst>(0,
ncells), *
this);
 
  853     double time_const = timer.seconds() / ntrial / 
ncells;
 
  856     for (
int i=0; i<ntrial; ++i)
 
  857       Kokkos::parallel_for(Kokkos::TeamPolicy<ExecSpace,MomFluxTagConstTeam>(
ncells,num_basis,32).set_scratch_size(0,Kokkos::PerThread(64*8*2)), *
this);
 
  859     double time_team = timer.seconds() / ntrial / 
ncells;
 
  861     printf(
"%5d   %9.3e   %9.3e   %9.3e   %9.3e   %9.3e\n",
ncells,time_fad,time_fad_cont,time,time_const,time_team);
 
  864       const double tol = 1e-14;
 
  873 template <
typename ExecSpace>
 
  874 void run(
const int cell_begin, 
const int cell_end, 
const int cell_step,
 
  875          const int nbasis, 
const int npoint, 
const int ntrial, 
const bool check)
 
  877   const int fad_dim = 50;
 
  885   std::cout << 
"concurrency = " << ExecSpace::concurrency() << std::endl;
 
  886   const size_t block_size = fad_dim*
sizeof(double);
 
  887   size_t nkernels = ExecSpace::concurrency()*2;
 
  888 #if defined(KOKKOS_ENABLE_CUDA) 
  889   if (std::is_same<ExecSpace, Kokkos::Cuda>::value)
 
  892   size_t mem_pool_size =
 
  893     static_cast<size_t>(1.2*nkernels*block_size);
 
  894   const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
 
  895     std::cout << 
"Memory pool size = " << mem_pool_size / (1024.0 * 1024.0)
 
  896               << 
" MB" << std::endl;
 
  897     ExecSpace exec_space;
 
  899         block_size, block_size, superblock_size);
 
  901 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL) 
  902   std::cout << 
"hierarchical";
 
  903 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) 
  904   std::cout << 
"hierarchical_dfad";
 
  906   std::cout << 
"partitioned";
 
  911   printf(
"ncell      flat        hier      analytic     const        team\n");
 
  912   for(
int i=cell_begin; i<=cell_end; i+=cell_step) {
 
  913     test_type 
test(i,nbasis,npoint);
 
  914     test.compute(ntrial, check);
 
  920 int main(
int argc, 
char* argv[]) {
 
  926     clp.
setDocString(
"This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
 
  927 #ifdef KOKKOS_ENABLE_SERIAL 
  929     clp.
setOption(
"serial", 
"no-serial", &serial, 
"Whether to run Serial");
 
  931 #ifdef KOKKOS_ENABLE_OPENMP 
  933     clp.
setOption(
"openmp", &openmp, 
"Number of OpenMP threads");
 
  935 #ifdef KOKKOS_ENABLE_THREADS 
  937     clp.
setOption(
"threads", &threads, 
"Number of pThreads threads");
 
  939 #ifdef KOKKOS_ENABLE_CUDA 
  941     clp.
setOption(
"cuda", 
"no-cuda", &cuda, 
"Whether to run CUDA");
 
  945                   "Number of NUMA domains to use (set to 0 to use all NUMAs");
 
  946     int cores_per_numa = 0;
 
  947     clp.
setOption(
"cores-per-numa", &cores_per_numa,
 
  948                   "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
 
  949     bool print_config = 
false;
 
  950     clp.
setOption(
"print-config", 
"no-print-config", &print_config,
 
  951                   "Whether to print Kokkos device configuration");
 
  952     int cell_begin = 100;
 
  953     clp.
setOption(
"begin", &cell_begin, 
"Starting number of cells");
 
  955     clp.
setOption(
"end", &cell_end, 
"Ending number of cells");
 
  957     clp.
setOption(
"step", &cell_step, 
"Cell increment");
 
  959     clp.
setOption(
"basis", &nbasis, 
"Number of basis functions");
 
  961     clp.
setOption(
"point", &npoint, 
"Number of integration points");
 
  963     clp.
setOption(
"trial", &ntrial, 
"Number of trials");
 
  965     clp.
setOption(
"check", 
"no-check", &check,
 
  966                   "Check correctness of results");
 
  969     switch (clp.
parse(argc, argv)) {
 
  979     Kokkos::InitArguments init_args;
 
  980     init_args.num_threads = -1;
 
  981     #ifdef KOKKOS_ENABLE_OPENMP 
  982       if(openmp) init_args.num_threads = openmp;
 
  984     #ifdef KOKKOS_ENABLE_THREADS 
  985       if(threads) init_args.num_threads = threads;
 
  988     Kokkos::initialize(init_args);
 
  990       Kokkos::print_configuration(std::cout, 
true);
 
  992 #ifdef KOKKOS_ENABLE_SERIAL 
  994       using Kokkos::Serial;
 
  995       run<Serial>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
 
  999 #ifdef KOKKOS_ENABLE_OPENMP 
 1001       using Kokkos::OpenMP;
 
 1002       run<OpenMP>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
 
 1006 #ifdef KOKKOS_ENABLE_THREADS 
 1008       using Kokkos::Threads;
 
 1009       run<Threads>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
 
 1013 #ifdef KOKKOS_ENABLE_CUDA 
 1016       run<Cuda>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
 
static const int FadStride
Kokkos::TeamPolicy< ExecSpace >::member_type team_handle
FluxView::non_const_value_type scalar_type
t_3DViewFadCont wbs_fad_cont
KOKKOS_INLINE_FUNCTION size_t num_cells() const 
Kokkos::View< AlignedFadType **, ContLayout, ExecSpace > t_2DViewFadCont
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void run_hierarchical_team(const KernelType &kernel)
t_3DViewFadCont flux_m_i_fad_cont
DrekarTest(int ncells_, int num_basis_, int num_points_)
GeneralFad< StaticStorage< T, Num > > SLFad
KOKKOS_INLINE_FUNCTION AdvectionKernel< FluxView, WgbView, SrcView, WbsView, ResidualView > create_advection_kernel(const FluxView &flux, const WgbView &bg, const SrcView &src, const WbsView &bs, const ResidualView &residual, const typename FluxView::non_const_value_type &coeff)
void run(const int cell_begin, const int cell_end, const int cell_step, const int nbasis, const int npoint, const int ntrial, const bool check)
Kokkos::View< double **[N+1], ExecSpace > t_2DView
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
t_3DView_const_team tflux_m_i_const
Kokkos::View< AlignedFadType ***, ContLayout, ExecSpace > t_3DViewFadCont
Kokkos::View< double **[N+1], Kokkos::LayoutRight, ExecSpace > t_2DView_team
t_4DViewFadCont wgb_fad_cont
KOKKOS_INLINE_FUNCTION void operator()(const MomFluxTag, const std::size_t &cell) const 
#define KOKKOS_INLINE_FUNCTION
t_2DView_team tresidual_m_i
Sacado::Fad::DFad< double > FadType
void compute(const int ntrial, const bool do_check)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
FadType::value_type generate_fad(const size_t n0, const size_t n1, const size_t n2, const size_t n3, const int fad_size, const size_t i0, const size_t i1, const size_t i2, const size_t i3, const int i_fad)
std::enable_if< Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
GeneralFad< DynamicStorage< T > > DFad
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
const ResidualView residual_m_i
static std::string eval()
void init_array(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
FluxView::execution_space execution_space
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
static std::string eval()
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const 
Kokkos::View< AlignedFadType ****, ContLayout, ExecSpace > t_4DViewFadCont
void run_flat(const KernelType &kernel)
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
KOKKOS_INLINE_FUNCTION void operator()(const size_t cell, const int basis) const 
Kokkos::View< double ***[N+1], Kokkos::LayoutRight, ExecSpace > t_3DView_team
Kokkos::TeamPolicy< execution_space >::member_type team_handle
void init_fad(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
t_2DViewFadCont residual_m_i_fad_cont
void run_hierarchical_flat(const KernelType &kernel)
t_2DView residual_m_i_const
void setDocString(const char doc_string[])
Kokkos::View< double[N+1], typename ExecSpace::scratch_memory_space, Kokkos::MemoryTraits< Kokkos::Unmanaged > > t_shared_scalar
t_3DView_const flux_m_i_const
Kokkos::View< FadType ****, ExecSpace > t_4DViewFad
Kokkos::View< FadType ***, ExecSpace > t_3DViewFad
ExecSpace::array_layout DefaultLayout
Kokkos::LayoutContiguous< DefaultLayout, FadStride > ContLayout
static std::string eval()
Kokkos::View< const double ***[N+1], ExecSpace, Kokkos::MemoryTraits< Kokkos::RandomAccess > > t_3DView_const
void destroyGlobalMemoryPool(const ExecSpace &space)
KOKKOS_INLINE_FUNCTION void compute_one(const MomFluxTagConstTeam, const team_handle &team, const int &cell, const int &basis, const t_shared_scalar &value, const t_shared_scalar &value2) const 
Kokkos::View< double ***[N+1], ExecSpace > t_3DView
Kokkos::View< const double ***[N+1], Kokkos::LayoutRight, ExecSpace, Kokkos::MemoryTraits< Kokkos::RandomAccess > > t_3DView_const_team
t_2DViewFadCont src_m_i_fad_cont
t_2DViewFad residual_m_i_fad
Kokkos::View< double ****[N+1], Kokkos::LayoutRight, ExecSpace > t_4DView_team
GeneralFad< StaticFixedStorage< T, Num > > SFad
Kokkos::View< double ****[N+1], ExecSpace > t_4DView
KOKKOS_INLINE_FUNCTION AdvectionKernel(const FluxView &flux, const WgbView &gb, const SrcView &src, const WbsView &bs, const ResidualView &residual, const scalar_type &c)
Kokkos::View< FadType **, ExecSpace > t_2DViewFad