#ifndef TRIPLETS_UTIL_H_
#define TRIPLETS_UTIL_H_

/* NOTE(review): the original first #include had lost its argument (likely a
   <...> target eaten in transit); the headers below are what this file
   actually uses (fprintf/printf, exit, assert, omp_get_wtime) -- confirm
   against the original source. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#include "layout-ptr.h"

/** a macro for checking CUDA calls on the host: on failure, prints the file,
		line, error string and the failing call, then terminates the process */
#define cucheck(call) \
	{ \
	cudaError_t cucheck_err = (call); \
	if(cucheck_err != cudaSuccess) { \
		const char* err_str = cudaGetErrorString(cucheck_err); \
		fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \
		exit(-1); \
	} \
	}

/** a macro for checking CUDA calls on device: on failure, prints the file,
		line and error string, then traps via assert(0) */
#define cucheck_dev(call) \
	{ \
	cudaError_t cucheck_err = (call); \
	if(cucheck_err != cudaSuccess) { \
		const char *err_str = cudaGetErrorString(cucheck_err); \
		printf("%s (%d): %s\n", __FILE__, __LINE__, err_str); \
		assert(0); \
	} \
	}

// no-op variant of cucheck_dev, kept for quickly disabling device-side checks
// (trailing backslashes removed from the disabled lines to avoid -Wcomment
// multi-line-comment warnings)
// #define cucheck_dev(call)
// {
//   call;
// }

/** a macro to start reconstruction time measurement; declares tstart in the
		current scope, so use at most once per scope */
#define reco_time_start() \
	double tstart = omp_get_wtime();

/** a macro to finish the measurement started by reco_time_start() in the
		same scope and print the elapsed wall-clock time in milliseconds */
#define reco_time_end() \
	double tend = omp_get_wtime(); \
	printf("total reconstruction time: %.3lf ms\n", (tend - tstart) * 1e3);

/** a macro to set a CUDA __device__ variable from the host:
		resolves the symbol's device address and copies one value of type T */
#define cuset(symbol, T, val) \
	{ \
	void *cuset_addr; \
	cucheck(cudaGetSymbolAddress(&cuset_addr, symbol)); \
	T cuset_val = (val); \
	cucheck(cudaMemcpy(cuset_addr, &cuset_val, sizeof(cuset_val), \
										 cudaMemcpyHostToDevice)); \
	}  // cuset

/* __host__ __device__ qualifier that degrades to nothing when this header is
	 included from plain (non-nvcc) C++ code */
#ifdef __CUDACC__
#define hostdevice__ __host__ __device__
#else
#define hostdevice__
#endif

/** integer division of a by b, rounded up; assumes b > 0 */
static inline hostdevice__ int divup(int a, int b) {
	return a / b + (a % b ? 1 : 0);
}

/** warp size TODO: redefine for future architectures */
#define WARP_SIZE 32

/* NOTE(review): the warp intrinsics below use the legacy mask-less forms
	 (__ballot, __shfl), which were removed for Volta+ (SM70+); porting to
	 __ballot_sync/__shfl_sync with explicit participation masks is required
	 for newer architectures. */

/** quick computation of lane_id; blockDim.x should be > WARP_SIZE */
static inline __device__ int lane_id(void) {
	return threadIdx.x % WARP_SIZE;
	// int lid;
	// asm("mov.u32 %0, %%laneid;" : "=r" (lid));
	// return lid;
}

/** gets the mask of active lanes in the warp */
static inline __device__ int warp_mask(void) { return __ballot(1); }

/** gets the count of active lanes from the mask */
static inline __device__ int nactive_lanes(int mask) { return __popc(mask); }

/** gets the current leader (lowest-numbered active lane) from the mask */
static inline __device__ int warp_leader(int mask) { return __ffs(mask) - 1; }

/** broadcasts the value from the leader lane across the warp */
static inline __device__ int warp_bcast(int val, int leader) {
	return __shfl(val, leader);
}

/** gets the 0-based number of the lane among the active lanes of the warp */
static inline __device__ int active_lid(int mask) {
	return __popc(mask & ((1 << lane_id()) - 1));
}

/** warp-aggregated atomic increment; return result is equivalent (but not
		equal) to each thread doing atomic increment individually */
static inline __device__ int atomicAggInc(int *p) {
	//return atomicAdd(p, 1);
	int lid = lane_id();
	int mask = warp_mask();
	int leader_lid = warp_leader(mask);
	// result value
	int res;
	// a single atomic per warp: the leader adds the active-lane count ...
	if(lid == leader_lid)
		res = atomicAdd(p, nactive_lanes(mask));
	// ... and broadcasts the pre-increment value to the other active lanes
	res = warp_bcast(res, leader_lid);
	// increment the result based on id of active thread within the warp
	res += active_lid(mask);
	return res;
}  // atomicAggInc

/** warp-aggregated atomic increment with multiple counters; ictr shouldn't
		vary a lot within a warp, as warp aggregation will then bring nothing but
		overhead */
static inline __device__ int atomicAggInc(int *ctrs, int ictr) {
	// each thread should get the leader and the mask for its ictr group
	int lid = lane_id(), leader = -1, mask, cur_mask;
	// BUGFIX(review): the original called ballot() -- an undeclared name; the
	// intrinsic is __ballot(), as used everywhere else in this file
	while((cur_mask = __ballot(leader == -1)) != 0)
		if(leader == -1) {
			// candidate leader: first lane that is still looking for a group
			leader = warp_leader(cur_mask);
			int leader_ictr = warp_bcast(ictr, leader);
			// only lanes that share the leader's counter index join its group
			if(ictr != leader_ictr)
				leader = -1;
			mask = __ballot(ictr == leader_ictr);
		}
	// leaders increment their counters
	int res;
	if(lid == leader)
		res = atomicAdd(&ctrs[ictr], nactive_lanes(mask));
	// broadcast the result within each group and offset by in-group lane id
	res = warp_bcast(res, leader);
	res += active_lid(mask);
	return res;
}  // atomicAggInc

// layouting-related definitions
using namespace layout_ptr;

/** overall dominant layout */
//typedef aos Tl;
typedef soa Tl;
/** the layout used for hits in BunchCtx */
typedef Tl TlHit;
//typedef aos TlHit;
/** the layout used for (candidate) tracks in BunchCtx */
typedef Tl TlTrack;
//typedef soa TlTrack;
/** layouts for tubes and triplets (presumably analogous -- named by
		convention; see layout-ptr.h) */
typedef Tl TlTube;
typedef Tl TlTriplet;

#endif