#ifndef TRIPLETS_UTIL_H_
#define TRIPLETS_UTIL_H_

/* NOTE(review): the original first #include had lost its argument (likely a
   <...> target eaten in transit); the headers below are what this file
   actually uses (fprintf/printf, exit, assert, omp_get_wtime) -- confirm
   against the original source. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#include "layout-ptr.h"

/** a macro for checking CUDA calls on the host: on failure, prints the file,
		line, error string and the failing call, then terminates the process */
#define cucheck(call) \
	{ \
	cudaError_t cucheck_err = (call); \
	if(cucheck_err != cudaSuccess) { \
		const char* err_str = cudaGetErrorString(cucheck_err); \
		fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \
		exit(-1); \
	} \
	}

/** a macro for checking CUDA calls on device: on failure, prints the file,
		line and error string, then traps via assert(0) */
#define cucheck_dev(call) \
	{ \
	cudaError_t cucheck_err = (call); \
	if(cucheck_err != cudaSuccess) { \
		const char *err_str = cudaGetErrorString(cucheck_err); \
		printf("%s (%d): %s\n", __FILE__, __LINE__, err_str); \
		assert(0); \
	} \
	}

// no-op variant of cucheck_dev, kept for quickly disabling device-side checks
// (trailing backslashes removed from the disabled lines to avoid -Wcomment
// multi-line-comment warnings)
// #define cucheck_dev(call)
// {
//   call;
// }

/** a macro to start reconstruction time measurement; declares tstart in the
		current scope, so use at most once per scope */
#define reco_time_start() \
	double tstart = omp_get_wtime();

/** a macro to finish the measurement started by reco_time_start() in the
		same scope and print the elapsed wall-clock time in milliseconds */
#define reco_time_end() \
	double tend = omp_get_wtime(); \
	printf("total reconstruction time: %.3lf ms\n", (tend - tstart) * 1e3);

/** a macro to set a CUDA __device__ variable from the host:
		resolves the symbol's device address and copies one value of type T */
#define cuset(symbol, T, val) \
	{ \
	void *cuset_addr; \
	cucheck(cudaGetSymbolAddress(&cuset_addr, symbol)); \
	T cuset_val = (val); \
	cucheck(cudaMemcpy(cuset_addr, &cuset_val, sizeof(cuset_val), \
										 cudaMemcpyHostToDevice)); \
	}  // cuset

/* __host__ __device__ qualifier that degrades to nothing when this header is
	 included from plain (non-nvcc) C++ code */
#ifdef __CUDACC__
#define hostdevice__ __host__ __device__
#else
#define hostdevice__
#endif

/** integer division of a by b, rounded up; assumes b > 0 */
static inline hostdevice__ int divup(int a, int b) {
	return a / b + (a % b ? 1 : 0);
}

/** warp size TODO: redefine for future architectures */
#define WARP_SIZE 32

/* NOTE(review): the warp intrinsics below use the legacy mask-less forms
	 (__ballot, __shfl), which were removed for Volta+ (SM70+); porting to
	 __ballot_sync/__shfl_sync with explicit participation masks is required
	 for newer architectures. */

/** quick computation of lane_id; blockDim.x should be > WARP_SIZE */
static inline __device__ int lane_id(void) {
	return threadIdx.x % WARP_SIZE;
	// int lid;
	// asm("mov.u32 %0, %%laneid;" : "=r" (lid));
	// return lid;
}

/** gets the mask of active lanes in the warp */
static inline __device__ int warp_mask(void) { return __ballot(1); }

/** gets the count of active lanes from the mask */
static inline __device__ int nactive_lanes(int mask) { return __popc(mask); }

/** gets the current leader (lowest-numbered active lane) from the mask */
static inline __device__ int warp_leader(int mask) { return __ffs(mask) - 1; }

/** broadcasts the value from the leader lane across the warp */
static inline __device__ int warp_bcast(int val, int leader) {
	return __shfl(val, leader);
}

/** gets the 0-based number of the lane among the active lanes of the warp */
static inline __device__ int active_lid(int mask) {
	return __popc(mask & ((1 << lane_id()) - 1));
}

/** warp-aggregated atomic increment; return result is equivalent (but not
		equal) to each thread doing atomic increment individually */
static inline __device__ int atomicAggInc(int *p) {
	//return atomicAdd(p, 1);
	int lid = lane_id();
	int mask = warp_mask();
	int leader_lid = warp_leader(mask);
	// result value
	int res;
	// a single atomic per warp: the leader adds the active-lane count ...
	if(lid == leader_lid)
		res = atomicAdd(p, nactive_lanes(mask));
	// ... and broadcasts the pre-increment value to the other active lanes
	res = warp_bcast(res, leader_lid);
	// increment the result based on id of active thread within the warp
	res += active_lid(mask);
	return res;
}  // atomicAggInc

/** warp-aggregated atomic increment with multiple counters; ictr shouldn't
		vary a lot within a warp, as warp aggregation will then bring nothing but
		overhead */
static inline __device__ int atomicAggInc(int *ctrs, int ictr) {
	// each thread should get the leader and the mask for its ictr group
	int lid = lane_id(), leader = -1, mask, cur_mask;
	// BUGFIX(review): the original called ballot() -- an undeclared name; the
	// intrinsic is __ballot(), as used everywhere else in this file
	while((cur_mask = __ballot(leader == -1)) != 0)
		if(leader == -1) {
			// candidate leader: first lane that is still looking for a group
			leader = warp_leader(cur_mask);
			int leader_ictr = warp_bcast(ictr, leader);
			// only lanes that share the leader's counter index join its group
			if(ictr != leader_ictr)
				leader = -1;
			mask = __ballot(ictr == leader_ictr);
		}
	// leaders increment their counters
	int res;
	if(lid == leader)
		res = atomicAdd(&ctrs[ictr], nactive_lanes(mask));
	// broadcast the result within each group and offset by in-group lane id
	res = warp_bcast(res, leader);
	res += active_lid(mask);
	return res;
}  // atomicAggInc

// layouting-related definitions
using namespace layout_ptr;

/** overall dominant layout */
//typedef aos Tl;
typedef soa Tl;
/** the layout used for hits in BunchCtx */
typedef Tl TlHit;
//typedef aos TlHit;
/** the layout used for (candidate) tracks in BunchCtx */
typedef Tl TlTrack;
//typedef soa TlTrack;
/** layouts for tubes and triplets (presumably analogous -- named by
		convention; see layout-ptr.h) */
typedef Tl TlTube;
typedef Tl TlTriplet;

#endif