#ifndef LAYOUT_H_
#define LAYOUT_H_

/** @file layout.h Definitions for SoA/AoS flexible layout. 
		
		This code has been developed along the lines of ideas of Robert
		Strzodka. The original ideas are described in:

		Abstraction for AoS and SoA Layout in
		C++, in: W.-M. W. Hwu (Ed.), GPU Computing Gems Jade Edition, Morgan
		Kaufmann, 2011, pp. 429 – 443.

		Description of these ideas, together with original code, is also available
		on Robert Strzodka's webpage:

		http://people.mpi-inf.mpg.de/~strzodka/projects/layout/
*/

#define __hostdevice__ __host__ __device__ 

/** information on a specific flexible type; should be overridden outside of any
		namespace */
template<class Tl, class T> struct layflex_t {};

namespace layout_ptr {

	// utility functions
  namespace layout_utils {
		/** division with rounding to infinity (for positive a and b) */
		static inline __hostdevice__ int laydivup(int a, int b) {
			return a / b + (a % b ? 1 : 0);
		}
		/** division with rounding to infinity (for positive a and b) */
		static inline __hostdevice__ size_t laydivup(size_t a, size_t b) {
			return a / b + (a % b ? (size_t)1 : (size_t)0);
		}
		/** rounding to nearest multiple of b to infinity (for positive a and b) */
		static inline __hostdevice__ size_t layroundup(size_t a, size_t b) {
			return laydivup(a, b) * b;
		}
		/** modulo guaranteed to be between 0 and b-1 (both ends inclusive) for
				positive b, where b is a power of 2 */
		static inline __hostdevice__ int laymodp2(ptrdiff_t a, int b) {
			return (int)(a & (b - 1));
		}
		/** division operation companion to modp2, so it shall hold that
		 divp2(a, b) * b + modp2(a, b) == a */
		static inline __hostdevice__ ptrdiff_t laydivp2(ptrdiff_t a, int b) {
			return (a - laymodp2(a, b)) / b;
		}
		//const uint ng = 128;
		const uint ng = 128;
	}  // namespace layout_utils
	
	using namespace layout_utils;

	/** Base class for layouts. */
	class layout {};

	/** Array of Structures (AoS) layout. */
	class aos : public layout {};

	/** Identical layout, not exactly equivalent to AoS. Note that this layout
			should not be used for container or pointer definitions. */
	class id_layout : public layout {};

	/** Structure of Arrays (SoA) layout. */
	class soa : public layout {};

	template<class Tl, class T> struct laygroup_t {};

	template<class T> struct laygroup_t<aos, T> { char dummy; };
	template<class T> struct laygroup_t<id_layout, T> { char dummy; };
	template<class T> struct laygroup_t<soa, T> { T dummy[ng]; };
		
	/** basic layout pointer, useful in specializations only */
	template<class Tl, class T> struct layptr {};
	
	/** Layouted memory allocation and copying. All sizes are in elements, not in
			bytes. lay_utils static functions are to be overridden by different
			layouts, while lay_* global functions are to be used externally. 
			@remarks: only phase-0 pointers and multiple-of-ng copies are supported for
			SoA allocations
	 */
	template<class Tl, class T> struct lay_utils {
		static layptr<Tl, T> alloc(size_t nels);
		static void free(layptr<Tl, T> p);
		static cudaError_t cuda_alloc(layptr<Tl, T>* p, size_t nels);
		static cudaError_t cuda_free(layptr<Tl, T> p);
		static cudaError_t cuda_memcpy
		(layptr<Tl, T> to, layptr<Tl, T> from, size_t nels, cudaMemcpyKind kind);	
	};  // lay_utils
	
	template<class Tl, class T> layptr<Tl, T> lay_alloc(size_t nels) {
		return lay_utils<Tl, T>::alloc(nels);
	}
	template<class Tl, class T> void lay_free(layptr<Tl, T> p) {
		lay_utils<Tl, T>::free(p);
	}
	template<class Tl, class T> 
	cudaError_t cuda_alloc(layptr<Tl, T>* p, size_t nels) {
		return lay_utils<Tl, T>::cuda_alloc(p, nels);
	}
	template<class Tl, class T>
	cudaError_t cuda_free(layptr<Tl, T> p) {
		return lay_utils<Tl, T>::cuda_free(p);
	}
	template<class Tl, class T> 
	cudaError_t host_alloc(layptr<Tl, T>* p, size_t nels) {
		return lay_utils<Tl, T>::host_alloc(p, nels);
	}
	template<class Tl, class T>
	cudaError_t host_free(layptr<Tl, T> p) {
		return lay_utils<Tl, T>::host_free(p);
	}
	template<class Tl, class T>
	cudaError_t cuda_copy
	(layptr<Tl, T> to, layptr<Tl, T> from, size_t nels, cudaMemcpyKind kind) {
		return lay_utils<Tl, T>::cuda_copy(to, from, nels, kind);
	}
	
	/** AoS-layouted pointer; T should be the "element" type (flexible type
			parameterized with id_layout), and not the flexible element */
	template<class T> struct layptr<aos, T> {
		/** element type, the type "pointed to" from the point of view of the
				developer */
		//typedef typename T::elem_t elem_t;
		typedef T elem_t;
		/** current layout type */
		typedef aos layout_t;
		/** flexible element type parameterized by the current layout */
		typedef typename layflex_t<aos, T>::flex_t flex_t;
		/** reference type, the type returned by reference, i.e. by * and [] operators
				*/
		typedef elem_t &ref_t;
		/** pointer type, the type returned by -> operator */
		typedef elem_t *ptr_t;
		/** block_t, the type used for block storage */
		typedef elem_t block_t;
	
		/** the underlying pointer */
		block_t *p;
	
		/** construct from an external pointer; prevent casts */
		__hostdevice__ explicit layptr(elem_t *p) : p(p) {}
		/** default constructor, leaves uninitialized */
		__hostdevice__ layptr() {}
		
		/** increment/decrement operators, both prefix and postfix */
		__hostdevice__ layptr<aos, T> operator++() { return layptr<aos, T>(++p); }
		__hostdevice__ layptr<aos, T> operator--() { return layptr<aos, T>(--p); }
		__hostdevice__ layptr<aos, T> operator++(int) { 
			layptr<aos, T> r(p++);
			return r;
		}
		__hostdevice__ layptr<aos, T> operator--(int) {
			layptr<aos, T> r(p++);
			return r;
		}
	
		/** dereferencing operator */
		__hostdevice__ ref_t operator *() const { return *p; }
	
		/** member access operator */
		__hostdevice__ ptr_t operator->() const { return p; }
	
		/** indexing operator */
		__hostdevice__ ref_t operator[](size_t i) const { return p[i]; }
	
	}; // AoS-layouted pointer
	
	/** AoS pointer arithmetic */
	template<class T>
	inline __hostdevice__ layptr<aos, T> operator+(layptr<aos, T> a, ptrdiff_t b) {
		return layptr<aos, T>(a.p + b);
	}
	template<class T>
	inline __hostdevice__ layptr<aos, T> operator+(ptrdiff_t b, layptr<aos, T> a) {
		return layptr<aos, T>(b + a.p);
	}
	template<class T>
	inline __hostdevice__ layptr<aos, T> operator-(layptr<aos, T> a, ptrdiff_t b) {
		return layptr<aos, T>(a.p - b);
	}
	template<class T>
	inline __hostdevice__ ptrdiff_t operator-(layptr<aos, T> a, layptr<aos, T> b) {
		return b.p - a.p;
	}

	/** lay_utils specialized for AoS layout */
	template<class T> struct lay_utils<aos, T> {
		static inline layptr<aos, T> alloc(size_t nels) {
			return layptr<aos, T>((T *)malloc(nels * sizeof(T)));
		}
		static inline void free(layptr<aos, T> p) {
			::free(p.p);
		}
		static inline	cudaError_t cuda_alloc(layptr<aos, T>* p, size_t nels) {
			return cudaMalloc((void **)&p->p, nels * sizeof(T));
		}
		static inline	cudaError_t cuda_free(layptr<aos, T> p) {
			return cudaFree(p.p);
		}
		static inline	cudaError_t host_alloc(layptr<aos, T>* p, size_t nels) {
			return cudaMallocHost((void **)&p->p, nels * sizeof(T));
		}
		static inline	cudaError_t host_free(layptr<aos, T> p) {
			return cudaFreeHost(p.p);
		}
		static inline	cudaError_t cuda_copy
		(layptr<aos, T> to, layptr<aos, T> from, size_t nels, cudaMemcpyKind kind) {
			return cudaMemcpy(to.p, from.p, nels * sizeof(T), kind);
		}
	};  // end of lay_utils
	
	/** SoA-layouted pointer */
	template<class T> struct layptr<soa, T> {
	
		/** element type, the type "pointed to" from the point of view of the
				developer */
		typedef T elem_t;
		/** current layout type */
		typedef soa layout_t;
		/** flexible element type parameterized by the current layout */
		typedef typename layflex_t<soa, T>::flex_t flex_t;
		/** reference type, the type returned by reference, i.e. by * and [] operators
				*/
		typedef flex_t &ref_t;
		/** pointer type, the type returned by -> operator */
		typedef flex_t *ptr_t;
		/** block_t, the type used for block storage */
		typedef flex_t block_t;
		/** field_t, the type used to shift within the block */
		typedef typename T::layfield_t field_t;
	
		/** the pointer to the block  */
		block_t *p0;
		/** the phase, which indicates the displacements within a single ng-sized
		group; the phase is in units of  */
		uint phase;
	
		/** constructor for public use, only phase 0 allowed */
		__hostdevice__ explicit layptr(elem_t *p) : p0((block_t *)p), phase(0) {}
		/** constructor for internal use, any phase allowed */
		__hostdevice__ layptr(block_t *p0, uint phase) : p0(p0), phase(phase) {}
		/** default constructor, leaves everything uninitialized */
		__hostdevice__ layptr() {}
	
		/** dereferencing operator */
		__hostdevice__ ref_t operator *() const { 
			return (ref_t)(*((field_t *)p0 + phase));
		}
		/** member access operator */
		__hostdevice__ ptr_t operator->() const { 
			return (ptr_t)((field_t *)p0 + phase);
		}
		/** indexing operator */
		__hostdevice__ ref_t operator[](size_t i) const {
			size_t i1 = i + phase;
			return (ref_t)(*((field_t *)(p0 + i1 / ng) + i1 % ng));
		}
		/** increment/decrement operators */
		__hostdevice__ void incr() {
			phase++;
			if(phase == ng) {
				p0++;
				phase = 0;
			}
		}  // incr
		__hostdevice__ void decr() {
			if(phase == 0) {
				p0--;
				phase = ng;
			}
			phase--;
		}  // decr
		__hostdevice__ layptr<soa, T> operator++() {
			this->incr();
			return *this;
		}  // operator++
		__hostdevice__ layptr<soa, T> operator--() {
			this->decr();
			return *this;
		}  // operator--
		__hostdevice__ layptr<soa, T> operator++(int) {
			layptr<soa, T> res = *this;
			this->incr();
			return res;
		}  // operator++
		__hostdevice__ layptr<soa, T> operator--(int) {
			layptr<soa, T> res = *this;
			this->decr();
			return res;
		}  // operator--
	};  // layptr<soa>

	/** SoA pointer arithmetic */
	template<class T> __hostdevice__ inline
	layptr<soa, T> operator+(layptr<soa, T> a, ptrdiff_t b) {
		// NOTE: works only if ng is a power of 2
		ptrdiff_t i1 = a.phase + b;
		return layptr<soa, T>(a.p0 + laydivp2(i1, ng), laymodp2(i1, ng));
	}  // operator+
	template<class T>	__hostdevice__ inline 
	layptr<soa, T> operator+(ptrdiff_t a, layptr<soa, T> b) {
		return b + a;
	}  // operator+
	template<class T> __hostdevice__ inline
	layptr<soa, T> operator-(layptr<soa, T> a, ptrdiff_t b) {
		return a + (-b);
	}  // operator+
	template<class T> __hostdevice__ inline
	ptrdiff_t operator-(layptr<soa, T> a, layptr<soa, T> b) {
		return (a.p0 - b.p0) * (ptrdiff_t)ng + (a.phase - b.phase);
	}
	
	/** lay_utils specialized for SoA layout */
	template<class T> struct lay_utils<soa, T> {
		static inline size_t size(size_t nels) {
			return laydivup(nels, ng) * sizeof(typename layptr<soa, T>::block_t);
		}
		static inline layptr<soa, T> alloc(size_t nels) {
			return layptr<soa, T>((T *)malloc(size(nels)));
		}
		static inline void free(layptr<soa, T> p) {
			::free(p.p0);
		}
		static inline	cudaError_t cuda_alloc(layptr<soa, T>* p, size_t nels) {
			cudaError_t res = cudaMalloc((void **)&p->p0, size(nels));
			if(res == cudaSuccess)
				p->phase = 0;
			return res;
		}
		static inline	cudaError_t cuda_free(layptr<soa, T> p) {
			return cudaFree(p.p0);
		}
		static inline	cudaError_t host_alloc(layptr<soa, T>* p, size_t nels) {
			cudaError_t res = cudaMallocHost((void **)&p->p0, size(nels));
			if(res == cudaSuccess)
				p->phase = 0;
			return res;
		}
		static inline	cudaError_t host_free(layptr<soa, T> p) {
			return cudaFreeHost(p.p0);
		}
		static inline	cudaError_t cuda_copy
		(layptr<soa, T> to, layptr<soa, T> from, size_t nels, cudaMemcpyKind kind) {
			return cudaMemcpy(to.p0, from.p0, size(nels), kind);
		}
	};  
	
}  // namespace layout_ptr

#undef __hostdevice__

#endif