/** @file bunch-ctx-kernels.cu.h common kernel definitions to be called on device with dynamic parallelism processing, and on host with host stream processing */ /** this kernel computes histogram of hits by tubes */ __global__ void tube_hit_histo_k(BunchCtx *b) { int ihit = threadIdx.x + blockIdx.x * blockDim.x; b->tube_hit_histo_tkfun(ihit); } // tube_hit_histo_k /** this kernel groups hits by tubes */ __global__ void tube_hit_group_k(BunchCtx *b) { int ihit = threadIdx.x + blockIdx.x * blockDim.x; b->tube_hit_group_tkfun(ihit); } // tube_hit_group_k /** kernel to find tubes with non-zero hit counts */ __global__ void nz_tubes_k(BunchCtx *b) { int itube = threadIdx.x + blockIdx.x * blockDim.x; b->nz_tubes_tkfun(itube); } // nz_tubes /** a kernel to find triplets */ __global__ void find_triplets_k(BunchCtx *b) { int ihit = threadIdx.x + blockIdx.x * blockDim.x; b->find_triplets_tkfun(ihit); } // find_triplets_k /** a kernel to find half-skewelets */ __global__ void find_half_skewlets_k(BunchCtx *b) { int ihit = threadIdx.x + blockIdx.x * blockDim.x; b->find_half_skewlets_tkfun(ihit); } // find_half_skewlets_k /** a kernel to combine half-skewlets without poca test */ __global__ void combine_half_skewlets_k(BunchCtx *b, int irange) { int i1half = blockIdx.x * blockDim.x; int j1half = blockIdx.y * blockDim.y; int lihalf = threadIdx.x, ljhalf = threadIdx.y; int ihalf = lihalf + i1half; int jhalf = ljhalf + j1half; b->combine_half_skewlets_tkfun(irange, ihalf, jhalf); } // combine_half_skewlets_k /** separate kernel for poca test on half-skewlet pairs; separating it from combine_half_skewlets_k reduces branch divergence and improves performance of finding skewlets */ __global__ void poca_test_k(BunchCtx *b, int irange) { int ipair = threadIdx.x + blockIdx.x * blockDim.x; b->poca_test_tkfun(irange, ipair, true); } // poca_test_k /** computes the circles for the tracks found */ __global__ void compute_circles_k (layptr cand_tracks, int ncand_tracks) { int itrack = threadIdx.x + blockIdx.x * blockDim.x; compute_circles_tkfun(cand_tracks, ncand_tracks, itrack); } // compute_circles_k /** evaluates tracks against tubes using shared memory */ __global__ void eval_tracks_tubes_shared_k(BunchCtx *b) { int itrack = threadIdx.x + blockIdx.x * blockDim.x; b->eval_tracks_tubes_shared_tkfun(itrack); } // eval_tracks_tubes_shared_k /** evaluates tracks against tubes using "rasterization" */ #if __CUDA_ARCH__ == 350 __launch_bounds__(EVAL_TRACKS_BS,EVAL_TRACKS_RASTER_MIN_BLOCKS) #endif __global__ void eval_tracks_tubes_raster_k(BunchCtx *b) { int itrack = threadIdx.x + blockIdx.x * blockDim.x; b->eval_tracks_tubes_raster_tkfun(itrack); } // eval_tracks_tubes_raster_k /** evaluates tracks against all skewlets */ // #if __CUDA_ARCH__ == 350 // __launch_bounds__(EVAL_TRACKS_BS,EVAL_TRACKS_SKEWLETS_MIN_BLOCKS) // #endif __global__ void eval_tracks_skewlets_all_k(BunchCtx *b) { int itrack = threadIdx.x + blockIdx.x * blockDim.x; b->eval_tracks_skewlets_all_tkfun(itrack); //b->eval_tracks_skewlets_bins_tkfun(itrack); } // eval_tracks_skewlets_k /** evaluates tracks against skewlets with bins */ __global__ void eval_tracks_skewlets_bins_k(BunchCtx *b) { int itrack = threadIdx.x + blockIdx.x * blockDim.x; //b->eval_tracks_skewlets_full_tkfun(itrack); b->eval_tracks_skewlets_bins_tkfun(itrack); } // eval_tracks_skewlets_k /** this kernel combines two layers of triplets */ __global__ void combine_triplets_k (BunchCtx *b, int inner_id, int outer_id, float max_phi) { int tinner = threadIdx.x + blockDim.x * blockIdx.x; int touter = threadIdx.y + blockDim.y * blockIdx.y; b->combine_triplets_tkfun(inner_id, outer_id, max_phi, tinner, touter); } // combine_triplets_k