#ifndef LAYOUT_H_ #define LAYOUT_H_ /** @file layout.h Definitions for SoA/AoS flexible layout. This code has been developed along the lines of ideas of Robert Strzodka. The original ideas are described in: Abstraction for AoS and SoA Layout in C++, in: W.-M. W. Hwu (Ed.), GPU Computing Gems Jade Edition, Morgan Kaufmann, 2011, pp. 429 – 443. Description of these ideas, together with original code, is also available on Robert Strzodka's webpage: http://people.mpi-inf.mpg.de/~strzodka/projects/layout/ */ #define __hostdevice__ __host__ __device__ /** information on a specific flexible type; should be overridden outside of any namespace */ template struct layflex_t {}; namespace layout_ptr { // utility functions namespace layout_utils { /** division with rounding to infinity (for positive a and b) */ static inline __hostdevice__ int laydivup(int a, int b) { return a / b + (a % b ? 1 : 0); } /** division with rounding to infinity (for positive a and b) */ static inline __hostdevice__ size_t laydivup(size_t a, size_t b) { return a / b + (a % b ? (size_t)1 : (size_t)0); } /** rounding to nearest multiple of b to infinity (for positive a and b) */ static inline __hostdevice__ size_t layroundup(size_t a, size_t b) { return laydivup(a, b) * b; } /** modulo guaranteed to be between 0 and b-1 (both ends inclusive) for positive b, where b is a power of 2 */ static inline __hostdevice__ int laymodp2(ptrdiff_t a, int b) { return (int)(a & (b - 1)); } /** division operation companion to modp2, so it shall hold that divp2(a, b) * b + modp2(a, b) == a */ static inline __hostdevice__ ptrdiff_t laydivp2(ptrdiff_t a, int b) { return (a - laymodp2(a, b)) / b; } //const uint ng = 128; const uint ng = 128; } // namespace layout_utils using namespace layout_utils; /** Base class for layouts. */ class layout {}; /** Array of Structures (AoS) layout. */ class aos : public layout {}; /** Identical layout, not exactly equivalent to AoS. Note that this layout should not be used for container or pointer definitions. */ class id_layout : public layout {}; /** Structure of Arrays (SoA) layout. */ class soa : public layout {}; template struct laygroup_t {}; template struct laygroup_t { char dummy; }; template struct laygroup_t { char dummy; }; template struct laygroup_t { T dummy[ng]; }; /** basic layout pointer, useful in specializations only */ template struct layptr {}; /** Layouted memory allocation and copying. All sizes are in elements, not in bytes. lay_utils static functions are to be overridden by different layouts, while lay_* global functions are to be used externally. @remarks: only phase-0 pointers and multiple-of-ng copies are supported for SoA allocations */ template struct lay_utils { static layptr alloc(size_t nels); static void free(layptr p); static cudaError_t cuda_alloc(layptr* p, size_t nels); static cudaError_t cuda_free(layptr p); static cudaError_t cuda_memcpy (layptr to, layptr from, size_t nels, cudaMemcpyKind kind); }; // lay_utils template layptr lay_alloc(size_t nels) { return lay_utils::alloc(nels); } template void lay_free(layptr p) { lay_utils::free(p); } template cudaError_t cuda_alloc(layptr* p, size_t nels) { return lay_utils::cuda_alloc(p, nels); } template cudaError_t cuda_free(layptr p) { return lay_utils::cuda_free(p); } template cudaError_t host_alloc(layptr* p, size_t nels) { return lay_utils::host_alloc(p, nels); } template cudaError_t host_free(layptr p) { return lay_utils::host_free(p); } template cudaError_t cuda_copy (layptr to, layptr from, size_t nels, cudaMemcpyKind kind) { return lay_utils::cuda_copy(to, from, nels, kind); } /** AoS-layouted pointer; T should be the "element" type (flexible type parameterized with id_layout), and not the flexible element */ template struct layptr { /** element type, the type "pointed to" from the point of view of the developer */ //typedef typename T::elem_t elem_t; typedef T elem_t; /** current layout type */ typedef aos layout_t; /** flexible element type parameterized by the current layout */ typedef typename layflex_t::flex_t flex_t; /** reference type, the type returned by reference, i.e. by * and [] operators */ typedef elem_t &ref_t; /** pointer type, the type returned by -> operator */ typedef elem_t *ptr_t; /** block_t, the type used for block storage */ typedef elem_t block_t; /** the underlying pointer */ block_t *p; /** construct from an external pointer; prevent casts */ __hostdevice__ explicit layptr(elem_t *p) : p(p) {} /** default constructor, leaves uninitialized */ __hostdevice__ layptr() {} /** increment/decrement operators, both prefix and postfix */ __hostdevice__ layptr operator++() { return layptr(++p); } __hostdevice__ layptr operator--() { return layptr(--p); } __hostdevice__ layptr operator++(int) { layptr r(p++); return r; } __hostdevice__ layptr operator--(int) { layptr r(p++); return r; } /** dereferencing operator */ __hostdevice__ ref_t operator *() const { return *p; } /** member access operator */ __hostdevice__ ptr_t operator->() const { return p; } /** indexing operator */ __hostdevice__ ref_t operator[](size_t i) const { return p[i]; } }; // AoS-layouted pointer /** AoS pointer arithmetic */ template inline __hostdevice__ layptr operator+(layptr a, ptrdiff_t b) { return layptr(a.p + b); } template inline __hostdevice__ layptr operator+(ptrdiff_t b, layptr a) { return layptr(b + a.p); } template inline __hostdevice__ layptr operator-(layptr a, ptrdiff_t b) { return layptr(a.p - b); } template inline __hostdevice__ ptrdiff_t operator-(layptr a, layptr b) { return b.p - a.p; } /** lay_utils specialized for AoS layout */ template struct lay_utils { static inline layptr alloc(size_t nels) { return layptr((T *)malloc(nels * sizeof(T))); } static inline void free(layptr p) { ::free(p.p); } static inline cudaError_t cuda_alloc(layptr* p, size_t nels) { return cudaMalloc((void **)&p->p, nels * sizeof(T)); } static inline cudaError_t cuda_free(layptr p) { return cudaFree(p.p); } static inline cudaError_t host_alloc(layptr* p, size_t nels) { return cudaMallocHost((void **)&p->p, nels * sizeof(T)); } static inline cudaError_t host_free(layptr p) { return cudaFreeHost(p.p); } static inline cudaError_t cuda_copy (layptr to, layptr from, size_t nels, cudaMemcpyKind kind) { return cudaMemcpy(to.p, from.p, nels * sizeof(T), kind); } }; // end of lay_utils /** SoA-layouted pointer */ template struct layptr { /** element type, the type "pointed to" from the point of view of the developer */ typedef T elem_t; /** current layout type */ typedef soa layout_t; /** flexible element type parameterized by the current layout */ typedef typename layflex_t::flex_t flex_t; /** reference type, the type returned by reference, i.e. by * and [] operators */ typedef flex_t &ref_t; /** pointer type, the type returned by -> operator */ typedef flex_t *ptr_t; /** block_t, the type used for block storage */ typedef flex_t block_t; /** field_t, the type used to shift within the block */ typedef typename T::layfield_t field_t; /** the pointer to the block */ block_t *p0; /** the phase, which indicates the displacements within a single ng-sized group; the phase is in units of */ uint phase; /** constructor for public use, only phase 0 allowed */ __hostdevice__ explicit layptr(elem_t *p) : p0((block_t *)p), phase(0) {} /** constructor for internal use, any phase allowed */ __hostdevice__ layptr(block_t *p0, uint phase) : p0(p0), phase(phase) {} /** default constructor, leaves everything uninitialized */ __hostdevice__ layptr() {} /** dereferencing operator */ __hostdevice__ ref_t operator *() const { return (ref_t)(*((field_t *)p0 + phase)); } /** member access operator */ __hostdevice__ ptr_t operator->() const { return (ptr_t)((field_t *)p0 + phase); } /** indexing operator */ __hostdevice__ ref_t operator[](size_t i) const { size_t i1 = i + phase; return (ref_t)(*((field_t *)(p0 + i1 / ng) + i1 % ng)); } /** increment/decrement operators */ __hostdevice__ void incr() { phase++; if(phase == ng) { p0++; phase = 0; } } // incr __hostdevice__ void decr() { if(phase == 0) { p0--; phase = ng; } phase--; } // decr __hostdevice__ layptr operator++() { this->incr(); return *this; } // operator++ __hostdevice__ layptr operator--() { this->decr(); return *this; } // operator-- __hostdevice__ layptr operator++(int) { layptr res = *this; this->incr(); return res; } // operator++ __hostdevice__ layptr operator--(int) { layptr res = *this; this->decr(); return res; } // operator-- }; // layptr /** SoA pointer arithmetic */ template __hostdevice__ inline layptr operator+(layptr a, ptrdiff_t b) { // NOTE: works only if ng is a power of 2 ptrdiff_t i1 = a.phase + b; return layptr(a.p0 + laydivp2(i1, ng), laymodp2(i1, ng)); } // operator+ template __hostdevice__ inline layptr operator+(ptrdiff_t a, layptr b) { return b + a; } // operator+ template __hostdevice__ inline layptr operator-(layptr a, ptrdiff_t b) { return a + (-b); } // operator+ template __hostdevice__ inline ptrdiff_t operator-(layptr a, layptr b) { return (a.p0 - b.p0) * (ptrdiff_t)ng + (a.phase - b.phase); } /** lay_utils specialized for SoA layout */ template struct lay_utils { static inline size_t size(size_t nels) { return laydivup(nels, ng) * sizeof(typename layptr::block_t); } static inline layptr alloc(size_t nels) { return layptr((T *)malloc(size(nels))); } static inline void free(layptr p) { ::free(p.p0); } static inline cudaError_t cuda_alloc(layptr* p, size_t nels) { cudaError_t res = cudaMalloc((void **)&p->p0, size(nels)); if(res == cudaSuccess) p->phase = 0; return res; } static inline cudaError_t cuda_free(layptr p) { return cudaFree(p.p0); } static inline cudaError_t host_alloc(layptr* p, size_t nels) { cudaError_t res = cudaMallocHost((void **)&p->p0, size(nels)); if(res == cudaSuccess) p->phase = 0; return res; } static inline cudaError_t host_free(layptr p) { return cudaFreeHost(p.p0); } static inline cudaError_t cuda_copy (layptr to, layptr from, size_t nels, cudaMemcpyKind kind) { return cudaMemcpy(to.p0, from.p0, size(nels), kind); } }; } // namespace layout_ptr #undef __hostdevice__ #endif