17 #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE 
   18 #include <Kokkos_Macros.hpp> 
   20               "Including non-public Kokkos header files is not allowed.");
 
   22 #ifndef KOKKOS_MEMORYPOOL_HPP 
   23 #define KOKKOS_MEMORYPOOL_HPP 
   25 #include <Kokkos_Core_fwd.hpp> 
   28 #include <impl/Kokkos_ConcurrentBitset.hpp> 
   29 #include <impl/Kokkos_Error.hpp> 
   30 #include <impl/Kokkos_SharedAlloc.hpp> 
   43 void memory_pool_bounds_verification(
size_t min_block_alloc_size,
 
   44                                      size_t max_block_alloc_size,
 
   45                                      size_t min_superblock_size,
 
   46                                      size_t max_superblock_size,
 
   47                                      size_t max_block_per_superblock,
 
   48                                      size_t min_total_alloc_size);
 
   56 void _print_memory_pool_state(std::ostream &s, uint32_t 
const *sb_state_ptr,
 
   57                               int32_t sb_count, uint32_t sb_size_lg2,
 
   58                               uint32_t sb_state_size, uint32_t state_shift,
 
   59                               uint32_t state_used_mask);
 
   63 template <
typename DeviceType>
 
   66   using CB = Kokkos::Impl::concurrent_bitset;
 
   68   enum : uint32_t { bits_per_int_lg2 = CB::bits_per_int_lg2 };
 
   69   enum : uint32_t { state_shift = CB::state_shift };
 
   70   enum : uint32_t { state_used_mask = CB::state_used_mask };
 
   71   enum : uint32_t { state_header_mask = CB::state_header_mask };
 
   72   enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
 
   73   enum : uint32_t { max_bit_count = CB::max_bit_count };
 
   75   enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
 
   77   static KOKKOS_FUNCTION 
unsigned integral_power_of_two_that_contains(
 
   79     return N ? Kokkos::bit_width(N - 1) : 0;
 
  102   using base_memory_space = 
typename DeviceType::memory_space;
 
  106                                                  base_memory_space>::accessible
 
  109   using Tracker = Kokkos::Impl::SharedAllocationTracker;
 
  110   using Record  = Kokkos::Impl::SharedAllocationRecord<base_memory_space>;
 
  113   uint32_t *m_sb_state_array;
 
  114   uint32_t m_sb_state_size;
 
  115   uint32_t m_sb_size_lg2;
 
  116   uint32_t m_max_block_size_lg2;
 
  117   uint32_t m_min_block_size_lg2;
 
  119   int32_t m_hint_offset;  
 
  120   int32_t m_data_offset;  
 
  121   int32_t m_unused_padding;
 
  124   using memory_space = 
typename DeviceType::memory_space;
 
  127   enum : uint32_t { max_superblock_size = 1LU << 31  };
 
  128   enum : uint32_t { max_block_per_superblock = max_bit_count };
 
  132   KOKKOS_INLINE_FUNCTION
 
  133   bool operator==(MemoryPool 
const &other)
 const {
 
  134     return m_sb_state_array == other.m_sb_state_array;
 
  137   KOKKOS_INLINE_FUNCTION
 
  138   size_t capacity() const noexcept {
 
  139     return size_t(m_sb_count) << m_sb_size_lg2;
 
  142   KOKKOS_INLINE_FUNCTION
 
  143   size_t min_block_size() const noexcept {
 
  144     return (1LU << m_min_block_size_lg2);
 
  147   KOKKOS_INLINE_FUNCTION
 
  148   size_t max_block_size() const noexcept {
 
  149     return (1LU << m_max_block_size_lg2);
 
  152   struct usage_statistics {
 
  153     size_t capacity_bytes;        
 
  154     size_t superblock_bytes;      
 
  155     size_t max_block_bytes;       
 
  156     size_t min_block_bytes;       
 
  157     size_t capacity_superblocks;  
 
  158     size_t consumed_superblocks;  
 
  159     size_t consumed_blocks;       
 
  160     size_t consumed_bytes;        
 
  161     size_t reserved_blocks;  
 
  162     size_t reserved_bytes;   
 
  167   template <
typename ExecutionSpace = Kokkos::DefaultHostExecutionSpace>
 
  168   void get_usage_statistics(usage_statistics &stats)
 const {
 
  171         std::is_same_v<ExecutionSpace, Kokkos::DefaultHostExecutionSpace>);
 
  173     const size_t alloc_size = m_hint_offset * 
sizeof(uint32_t);
 
  175     uint32_t *
const sb_state_array =
 
  176         accessible ? m_sb_state_array : (uint32_t *)host.
allocate(alloc_size);
 
  179       Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>(
 
  180           ExecutionSpace{}, sb_state_array, m_sb_state_array, alloc_size);
 
  182           "MemoryPool::get_usage_statistics(): fence after copying state " 
  183           "array to HostSpace");
 
  186     stats.superblock_bytes     = (1LU << m_sb_size_lg2);
 
  187     stats.max_block_bytes      = (1LU << m_max_block_size_lg2);
 
  188     stats.min_block_bytes      = (1LU << m_min_block_size_lg2);
 
  189     stats.capacity_bytes       = stats.superblock_bytes * m_sb_count;
 
  190     stats.capacity_superblocks = m_sb_count;
 
  191     stats.consumed_superblocks = 0;
 
  192     stats.consumed_blocks      = 0;
 
  193     stats.consumed_bytes       = 0;
 
  194     stats.reserved_blocks      = 0;
 
  195     stats.reserved_bytes       = 0;
 
  197     const uint32_t *sb_state_ptr = sb_state_array;
 
  199     for (int32_t i = 0; i < m_sb_count; ++i, sb_state_ptr += m_sb_state_size) {
 
  200       const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift;
 
  202       if (block_count_lg2) {
 
  203         const uint32_t block_count    = 1u << block_count_lg2;
 
  204         const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2;
 
  205         const uint32_t block_size     = 1u << block_size_lg2;
 
  206         const uint32_t block_used     = (*sb_state_ptr) & state_used_mask;
 
  208         stats.consumed_superblocks++;
 
  209         stats.consumed_blocks += block_used;
 
  210         stats.consumed_bytes += 
static_cast<size_t>(block_used) * block_size;
 
  211         stats.reserved_blocks += block_count - block_used;
 
  212         stats.reserved_bytes +=
 
  213             static_cast<size_t>(block_count - block_used) * block_size;
 
  224   template <
typename ExecutionSpace = Kokkos::DefaultHostExecutionSpace>
 
  225   void print_state(std::ostream &s)
 const {
 
  228         std::is_same_v<ExecutionSpace, Kokkos::DefaultHostExecutionSpace>);
 
  230     const size_t alloc_size = m_hint_offset * 
sizeof(uint32_t);
 
  232     uint32_t *
const sb_state_array =
 
  233         accessible ? m_sb_state_array : (uint32_t *)host.
allocate(alloc_size);
 
  236       Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>(
 
  237           ExecutionSpace{}, sb_state_array, m_sb_state_array, alloc_size);
 
  239           "MemoryPool::print_state(): fence after copying state array to " 
  243     Impl::_print_memory_pool_state(s, sb_state_array, m_sb_count, m_sb_size_lg2,
 
  244                                    m_sb_state_size, state_shift,
 
  254   KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&)                 = 
default;
 
  255   KOKKOS_DEFAULTED_FUNCTION MemoryPool(
const MemoryPool &)            = 
default;
 
  256   KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&)      = 
default;
 
  257   KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(
const MemoryPool &) = 
default;
 
  259   KOKKOS_INLINE_FUNCTION MemoryPool()
 
  261         m_sb_state_array(nullptr),
 
  264         m_max_block_size_lg2(0),
 
  265         m_min_block_size_lg2(0),
 
  269         m_unused_padding(0) {}
 
  285   MemoryPool(
const base_memory_space &memspace,
 
  286              const size_t min_total_alloc_size, 
size_t min_block_alloc_size = 0,
 
  287              size_t max_block_alloc_size = 0, 
size_t min_superblock_size = 0)
 
  289         m_sb_state_array(nullptr),
 
  292         m_max_block_size_lg2(0),
 
  293         m_min_block_size_lg2(0),
 
  297         m_unused_padding(0) {
 
  298     const uint32_t int_align_lg2               = 3; 
 
  299     const uint32_t int_align_mask              = (1u << int_align_lg2) - 1;
 
  300     const uint32_t default_min_block_size      = 1u << 6;  
 
  301     const uint32_t default_max_block_size      = 1u << 12; 
 
  302     const uint32_t default_min_superblock_size = 1u << 20; 
 
  307     if (0 == min_block_alloc_size) {
 
  310       min_superblock_size =
 
  311           std::min(
size_t(default_min_superblock_size), min_total_alloc_size);
 
  313       min_block_alloc_size =
 
  314           std::min(
size_t(default_min_block_size), min_superblock_size);
 
  316       max_block_alloc_size =
 
  317           std::min(
size_t(default_max_block_size), min_superblock_size);
 
  318     } 
else if (0 == min_superblock_size) {
 
  324       const size_t max_superblock =
 
  325           min_block_alloc_size * max_block_per_superblock;
 
  327       min_superblock_size =
 
  328           std::min(max_superblock,
 
  329                    std::min(
size_t(max_superblock_size), min_total_alloc_size));
 
  332     if (0 == max_block_alloc_size) {
 
  333       max_block_alloc_size = min_superblock_size;
 
  347     Kokkos::Impl::memory_pool_bounds_verification(
 
  348         min_block_alloc_size, max_block_alloc_size, min_superblock_size,
 
  349         max_superblock_size, max_block_per_superblock, min_total_alloc_size);
 
  355     m_min_block_size_lg2 =
 
  356         integral_power_of_two_that_contains(min_block_alloc_size);
 
  358     m_max_block_size_lg2 =
 
  359         integral_power_of_two_that_contains(max_block_alloc_size);
 
  361     m_sb_size_lg2 = integral_power_of_two_that_contains(min_superblock_size);
 
  367       const uint64_t sb_size_mask = (1LU << m_sb_size_lg2) - 1;
 
  369       m_sb_count = (min_total_alloc_size + sb_size_mask) >> m_sb_size_lg2;
 
  376       const uint32_t max_block_count_lg2 = m_sb_size_lg2 - m_min_block_size_lg2;
 
  379           (CB::buffer_bound_lg2(max_block_count_lg2) + int_align_mask) &
 
  385     const size_t all_sb_state_size =
 
  386         (m_sb_count * m_sb_state_size + int_align_mask) & ~int_align_mask;
 
  390     const int32_t number_block_sizes =
 
  391         1 + m_max_block_size_lg2 - m_min_block_size_lg2;
 
  396     const int32_t block_size_array_size =
 
  397         (number_block_sizes + int_align_mask) & ~int_align_mask;
 
  399     m_hint_offset = all_sb_state_size;
 
  400     m_data_offset = m_hint_offset + block_size_array_size * HINT_PER_BLOCK_SIZE;
 
  404     const size_t header_size = m_data_offset * 
sizeof(uint32_t);
 
  405     const size_t alloc_size =
 
  406         header_size + (size_t(m_sb_count) << m_sb_size_lg2);
 
  408     Record *rec = Record::allocate(memspace, 
"Kokkos::MemoryPool", alloc_size);
 
  410     m_tracker.assign_allocated_record_to_uninitialized(rec);
 
  412     m_sb_state_array = (uint32_t *)rec->data();
 
  416     uint32_t *
const sb_state_array =
 
  417         accessible ? m_sb_state_array : (uint32_t *)host.
allocate(header_size);
 
  419     for (int32_t i = 0; i < m_data_offset; ++i) sb_state_array[i] = 0;
 
  423     for (int32_t i = 0; i < number_block_sizes; ++i) {
 
  424       const uint32_t block_size_lg2  = i + m_min_block_size_lg2;
 
  425       const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2;
 
  426       const uint32_t block_state     = block_count_lg2 << state_shift;
 
  427       const uint32_t hint_begin      = m_hint_offset + i * HINT_PER_BLOCK_SIZE;
 
  433       const int32_t jbeg = (i * m_sb_count) / number_block_sizes;
 
  434       const int32_t jend = ((i + 1) * m_sb_count) / number_block_sizes;
 
  436       sb_state_array[hint_begin]     = uint32_t(jbeg);
 
  437       sb_state_array[hint_begin + 1] = uint32_t(jbeg);
 
  439       for (int32_t j = jbeg; j < jend; ++j) {
 
  440         sb_state_array[j * m_sb_state_size] = block_state;
 
  447       Kokkos::Impl::DeepCopy<base_memory_space, Kokkos::HostSpace>(
 
  448           typename base_memory_space::execution_space{}, m_sb_state_array,
 
  449           sb_state_array, header_size);
 
  451           "MemoryPool::MemoryPool(): fence after copying state array from " 
  456       Kokkos::memory_fence();
 
  466   KOKKOS_FORCEINLINE_FUNCTION
 
  467   uint32_t get_block_size_lg2(uint32_t n) 
const noexcept {
 
  468     const unsigned i = integral_power_of_two_that_contains(n);
 
  470     return i < m_min_block_size_lg2 ? m_min_block_size_lg2 : i;
 
  475   KOKKOS_INLINE_FUNCTION
 
  476   uint32_t allocate_block_size(uint64_t alloc_size) 
const noexcept {
 
  477     return alloc_size <= (uint64_t(1) << m_max_block_size_lg2)
 
  478                ? (1UL << get_block_size_lg2(uint32_t(alloc_size)))
 
  493   void *allocate(
size_t alloc_size, int32_t attempt_limit = 1) const noexcept {
 
  494     if ((
size_t(1) << m_max_block_size_lg2) < alloc_size) {
 
  496           "Kokkos MemoryPool allocation request exceeded specified maximum " 
  500     if (0 == alloc_size) 
return nullptr;
 
  504     const uint32_t block_size_lg2 = get_block_size_lg2(alloc_size);
 
  509     const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2;
 
  510     const uint32_t block_state     = block_count_lg2 << state_shift;
 
  511     const uint32_t block_count     = 1u << block_count_lg2;
 
  517     volatile uint32_t *
const hint_sb_id_ptr =
 
  520         + HINT_PER_BLOCK_SIZE 
 
  521               * (block_size_lg2 - m_min_block_size_lg2); 
 
  523     const int32_t sb_id_begin = int32_t(hint_sb_id_ptr[1]);
 
  528 #if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) 
  529     const uint32_t block_id_hint = alloc_size;
 
  531     const uint32_t block_id_hint =
 
  532         (uint32_t)(Kokkos::Impl::clock_tic()
 
  533 #ifdef __CUDA_ARCH__  // FIXME_CUDA 
  536                    + (threadIdx.x + blockDim.x * threadIdx.y)
 
  542     uint32_t sb_state = block_state;
 
  546     volatile uint32_t *sb_state_array = 
nullptr;
 
  548     while (attempt_limit) {
 
  549       int32_t hint_sb_id = -1;
 
  554         sb_id = hint_sb_id = int32_t(*hint_sb_id_ptr);
 
  556         sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
 
  563       if (sb_state == (state_header_mask & *sb_state_array)) {
 
  568         const uint32_t count_lg2 = sb_state >> state_shift;
 
  569         const uint32_t mask      = (1u << count_lg2) - 1;
 
  572             sb_state_array, count_lg2, block_id_hint & mask, sb_state);
 
  579         if (0 <= result.
first) {  
 
  581           const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2;
 
  585           p = ((
char *)(m_sb_state_array + m_data_offset)) +
 
  586               (uint64_t(sb_id) << m_sb_size_lg2)       
 
  587               + (uint64_t(result.
first) << size_lg2);  
 
  602       sb_state = block_state;  
 
  605       bool update_hint        = 
false;
 
  606       int32_t sb_id_empty     = -1;
 
  607       int32_t sb_id_large     = -1;
 
  608       uint32_t sb_state_large = 0;
 
  610       sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size;
 
  612       for (int32_t i = 0, 
id = sb_id_begin; i < m_sb_count; ++i) {
 
  617         const uint32_t full_state = *sb_state_array;
 
  618         const uint32_t used       = full_state & state_used_mask;
 
  619         const uint32_t state      = full_state & state_header_mask;
 
  621         if (state == block_state) {
 
  624           if (used < block_count) {
 
  631             update_hint = used + 1 < block_count;
 
  635         } 
else if (0 == used) {
 
  638           if (-1 == sb_id_empty) {
 
  645         } 
else if ((-1 == sb_id_empty ) &&
 
  646                    (-1 == sb_id_large ) &&
 
  647                    (state < block_state ) &&
 
  649                    (used < (1u << (state >> state_shift)))) {
 
  655           sb_state_large = state;
 
  660         if (++
id < m_sb_count) {
 
  661           sb_state_array += m_sb_state_size;
 
  664           sb_state_array = m_sb_state_array;
 
  674         if (0 <= sb_id_empty) {
 
  683           sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
 
  688           const uint32_t state_empty = state_header_mask & *sb_state_array;
 
  692               state_empty == Kokkos::atomic_compare_exchange(
 
  693                                  sb_state_array, state_empty, block_state);
 
  694         } 
else if (0 <= sb_id_large) {
 
  698           sb_state = sb_state_large;
 
  700           sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
 
  708         Kokkos::atomic_compare_exchange(hint_sb_id_ptr, uint32_t(hint_sb_id),
 
  725   KOKKOS_INLINE_FUNCTION
 
  726   void deallocate(
void *p, 
size_t ) const noexcept {
 
  727     if (
nullptr == p) 
return;
 
  731         static_cast<char *
>(p) -
 
  732         reinterpret_cast<char *>(m_sb_state_array + m_data_offset);
 
  735     const int ok_contains =
 
  736         (0 <= d) && (
size_t(d) < (size_t(m_sb_count) << m_sb_size_lg2));
 
  738     int ok_block_aligned = 0;
 
  739     int ok_dealloc_once  = 0;
 
  742       const int sb_id = d >> m_sb_size_lg2;
 
  745       volatile uint32_t *
const sb_state_array =
 
  746           m_sb_state_array + (sb_id * m_sb_state_size);
 
  748       const uint32_t block_state = (*sb_state_array) & state_header_mask;
 
  749       const uint32_t block_size_lg2 =
 
  750           m_sb_size_lg2 - (block_state >> state_shift);
 
  752       ok_block_aligned = 0 == (d & ((1UL << block_size_lg2) - 1));
 
  754       if (ok_block_aligned) {
 
  759             (d & ((ptrdiff_t(1) << m_sb_size_lg2) - 1)) >> block_size_lg2;
 
  761         const int result = CB::release(sb_state_array, bit, block_state);
 
  763         ok_dealloc_once = 0 <= result;
 
  767     if (!ok_contains || !ok_block_aligned || !ok_dealloc_once) {
 
  768       Kokkos::abort(
"Kokkos MemoryPool::deallocate given erroneous pointer");
 
  774   KOKKOS_INLINE_FUNCTION
 
  775   int number_of_superblocks() const noexcept { 
return m_sb_count; }
 
  777   KOKKOS_INLINE_FUNCTION
 
  778   void superblock_state(
int sb_id, 
int &block_size, 
int &block_count_capacity,
 
  779                         int &block_count_used) 
const noexcept {
 
  781     block_count_capacity = 0;
 
  782     block_count_used     = 0;
 
  784     bool can_access_state_array = []() {
 
  786           (
return SpaceAccessibility<DefaultHostExecutionSpace,
 
  787                                      base_memory_space>::accessible;))
 
  789           (return SpaceAccessibility<DefaultExecutionSpace,
 
  790                                      base_memory_space>::accessible;))
 
  793     if (can_access_state_array) {
 
  796       const uint32_t state =
 
  797           ((uint32_t 
volatile *)m_sb_state_array)[sb_id * m_sb_state_size];
 
  799       const uint32_t block_count_lg2 = state >> state_shift;
 
  800       const uint32_t block_used      = state & state_used_mask;
 
  802       block_size           = 1LU << (m_sb_size_lg2 - block_count_lg2);
 
  803       block_count_capacity = 1LU << block_count_lg2;
 
  804       block_count_used     = block_used;
 
void * allocate(const ExecutionSpace &, const size_t arg_alloc_size) const 
Allocate untracked memory in the space. 
 
Replacement for std::pair that works on CUDA devices. 
 
void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const 
Deallocate untracked memory in the space. 
 
first_type first
The first element of the pair. 
 
Memory management for host memory. 
 
Declaration of parallel operators. 
 
Access relationship between DstMemorySpace and SrcMemorySpace.