/** @file bunch-ctx-dyn.cu.h kernels and methods for track reconstruction with dynamic parallelism */ // kernels // global reconstruction kernel __global__ void reconstruct_tblock_k(void) { const GlobalCtx *g = &global_ctx_g; int ibunch = blockIdx.x; BunchCtx *bunch = &g->d_bunches[ibunch]; bunch->reconstruct_tblock(); } // reconstruct_k /** this "kernel" computes histogram of hits by tubes */ __device__ void tube_hit_histo_ktblock(BunchCtx *b) { //int ihit = threadIdx.x + blockIdx.x * blockDim.x; for(int ihit = threadIdx.x; ihit < b->nhits; ihit += blockDim.x) b->tube_hit_histo_tkfun(ihit); } // tube_hit_histo_k /** this kernel groups hits by tubes */ __device__ void tube_hit_group_ktblock(BunchCtx *b) { //int ihit = threadIdx.x + blockIdx.x * blockDim.x; for(int ihit = threadIdx.x; ihit < b->nhits; ihit += blockDim.x) b->tube_hit_group_tkfun(ihit); } // tube_hit_group_k /** kernel to find tubes with non-zero hit counts */ __device__ void nz_tubes_ktblock(BunchCtx *b) { //int itube = threadIdx.x + blockIdx.x * blockDim.x; GlobalCtx *g = &global_ctx_g; for(int itube = threadIdx.x; itube < g->ntubes; itube += blockDim.x) b->nz_tubes_tkfun(itube); } // nz_tubes /** a kernel to find triplets */ __device__ void find_triplets_ktblock(BunchCtx *b) { //int ihit = threadIdx.x + blockIdx.x * blockDim.x; for(int ihit = threadIdx.x; ihit < b->npivot_hits; ihit += blockDim.x) b->find_triplets_tkfun(ihit); } // find_triplets_k /** a kernel to find half-skewelets */ __device__ void find_half_skewlets_ktblock(BunchCtx *b) { //int ihit = threadIdx.x + blockIdx.x * blockDim.x; for(int ihit = threadIdx.x; ihit < b->nskewed_hits; ihit += blockDim.x) b->find_half_skewlets_tkfun(ihit); } // find_half_skewlets_k /** a kernel to combine half-skewlets without poca test */ __device__ void combine_half_skewlets_ktblock(BunchCtx *b, int irange) { // int i1half = blockIdx.x * blockDim.x; // int j1half = blockIdx.y * blockDim.y; // int lihalf = threadIdx.x, ljhalf = threadIdx.y; // int ihalf = lihalf + i1half; // int jhalf = ljhalf + j1half; // using TBX x 1 arrangement for(int jhalf = 0; jhalf < b->nhalf_skewlets[irange + 1]; jhalf++) for(int ihalf = threadIdx.x; ihalf < b->nhalf_skewlets[irange]; ihalf += blockDim.x) b->combine_half_skewlets_tkfun(irange, ihalf, jhalf); } // combine_half_skewlets_k /** separate kernel for poca test on half-skewlet pairs; separating it from combine_half_skewlets_k reduces branch divergence and improves performance of finding skewlets */ __device__ void poca_test_ktblock(BunchCtx *b, int irange) { //int ipair = threadIdx.x + blockIdx.x * blockDim.x; for(int ipair = threadIdx.x; ipair < b->npre_poca_pairs[irange]; ipair += blockDim.x) b->poca_test_tkfun(irange, ipair, false); } // poca_test_k /** computes the circles for the tracks found */ __device__ void compute_circles_ktblock (layptr cand_tracks, int ncand_tracks) { //int itrack = threadIdx.x + blockIdx.x * blockDim.x; for(int itrack = threadIdx.x; itrack < ncand_tracks; itrack += blockDim.x) compute_circles_tkfun(cand_tracks, ncand_tracks, itrack); } // compute_circles_k /** evaluates tracks against tubes using shared memory */ __device__ void eval_tracks_tubes_shared_ktblock(BunchCtx *b) { //int itrack = threadIdx.x + blockIdx.x * blockDim.x; // round the number of "tracks" to the multiple of number of blocks int nthreads = divup(b->ncand_tracks, blockDim.x) * blockDim.x; for(int itrack = threadIdx.x; itrack < nthreads; itrack += blockDim.x) b->eval_tracks_tubes_shared_tkfun(itrack); } // eval_tracks_tubes_shared_k /** evaluates tracks against tubes using "rasterization" */ __device__ void eval_tracks_tubes_raster_ktblock(BunchCtx *b) { //int itrack = threadIdx.x + blockIdx.x * blockDim.x; int nthreads = divup(b->ncand_tracks, blockDim.x) * blockDim.x; for(int itrack = threadIdx.x; itrack < nthreads; itrack += blockDim.x) b->eval_tracks_tubes_raster_tkfun(itrack); } // eval_tracks_tubes_raster_k /** evaluates tracks against all skewlets */ __device__ void eval_tracks_skewlets_all_ktblock(BunchCtx *b) { //int itrack = threadIdx.x + blockIdx.x * blockDim.x; for(int itrack = threadIdx.x; itrack < b->ncand_tracks; itrack += blockDim.x) b->eval_tracks_skewlets_all_tkfun(itrack); } // eval_tracks_skewlets_all_ktblock /** evaluates tracks against bins skewlets */ __device__ void eval_tracks_skewlets_bins_ktblock(BunchCtx *b) { //int itrack = threadIdx.x + blockIdx.x * blockDim.x; for(int itrack = threadIdx.x; itrack < b->ncand_tracks; itrack += blockDim.x) b->eval_tracks_skewlets_bins_tkfun(itrack); } // eval_tracks_skewlets_bins_ktblock /** this kernel combines two layers of triplets */ __device__ void combine_triplets_ktblock (BunchCtx *b, int inner_id, int outer_id, float max_phi) { //int tinner = threadIdx.x + blockDim.x * blockIdx.x; //int touter = threadIdx.y + blockDim.y * blockIdx.y; for(int touter = 0; touter < b->ntriplets[outer_id]; touter++) for(int tinner = threadIdx.x; tinner < b->ntriplets[inner_id]; tinner += blockDim.x) b->combine_triplets_tkfun(inner_id, outer_id, max_phi, tinner, touter); } // combine_triplets_k // (non-kernel) methods for reconstruction with dynamic parallelism __device__ void BunchCtx::reconstruct_tblock(void) { // do reconstruction find_tube_hits_tblock(); find_half_skewlets_tblock(); find_triplets_tblock(); find_cand_tracks_tblock(); find_skewlets_tblock(); eval_cand_tracks_tblock(); } // reconstruct_dyn __device__ void BunchCtx::find_tube_hits_tblock(void) { const GlobalCtx *g = &global_ctx_g; // find pivot tube hits tblock_copy_if(pivot_hits, &npivot_hits, hits, nhits, hit_is_pivot(g->d_tubes)); // find skewed tube hits tblock_copy_if(skewed_hits, &nskewed_hits, hits, nhits, hit_is_skewed(g->d_tubes)); __syncthreads(); // TODO: rewrite this as well #if 0 //#if !BUNCH_HOST int bs = 256; // get tube-hit count histogram tube_hit_histo_k<<>>(this); cucheck_dev(cudaGetLastError()); cucheck_dev(cudaDeviceSynchronize()); // obtain positions from counts using prefix sums (currently sequential) dyn_xpre_sum(tube_hit_starts, tube_hit_counts, g->ntubes + 1, s); // remove tubes with zero counts nz_tubes_k<<ntubes, bs), bs, 0, s>>>(this); // group hits by tubes cucheck_dev(cudaMemsetAsync (tube_hit_counts, 0, (g->ntubes + 1) * sizeof(int), s)); cucheck_dev(cudaDeviceSynchronize()); tube_hit_group_k<<>>(this); #endif } // find_tube_hits __device__ void BunchCtx::find_triplets_tblock(void) { const GlobalCtx *g = &global_ctx_g; // reset stuff if(threadIdx.x == 0) memset(ntriplets, 0, g->npivot_ranges * sizeof(int)); __syncthreads(); // find triplets find_triplets_ktblock(this); __syncthreads(); } // find_triplets __device__ void BunchCtx::find_half_skewlets_tblock(void) { // find half-skewlets find_half_skewlets_ktblock(this); __syncthreads(); } // find_half_skewlets __device__ void BunchCtx::find_skewlets_tblock(void) { const GlobalCtx *g = &global_ctx_g; // main skewlet test (all except poca) for(int irange = 0; irange < g->nskewlet_ranges; irange++) { //int ninner = nhalf_skewlets[irange], nouter = nhalf_skewlets[irange + 1]; combine_half_skewlets_ktblock(this, irange); __syncthreads(); } // loop over ranges // poca test for(int irange = 0; irange < g->nskewlet_ranges; irange++) { poca_test_ktblock(this, irange); __syncthreads(); } } // find_skewlets __device__ void BunchCtx::find_cand_tracks_tblock(void) { const GlobalCtx *g = &global_ctx_g; // go through all tracks for(int irange = 0; irange < g->npivot_ranges; irange++) for(int jrange = irange + 1; jrange < g->npivot_ranges; jrange++) { combine_triplets_ktblock(this, irange, jrange, max_phis_g[irange][jrange]); __syncthreads(); } // compute circles for track candidates compute_circles_ktblock(cand_tracks, ncand_tracks); __syncthreads(); } // find_cand_tracks __device__ void BunchCtx::eval_cand_tracks_tblock(void) { const GlobalCtx *g = &global_ctx_g; //int bs = EVAL_TRACKS_BS; switch(g->opts.tube_strategy()) { case TubeTestRaster: eval_tracks_tubes_raster_ktblock(this); break; case TubeTestShared: eval_tracks_tubes_shared_ktblock(this); break; default: assert(0); } __syncthreads(); // skewlets switch(g->opts.skewlet_strategy()) { case SkewletTestAll: eval_tracks_skewlets_all_ktblock(this); break; case SkewletTestBins: eval_tracks_skewlets_bins_ktblock(this); break; default: assert(0); } __syncthreads(); // group all good tracks tblock_copy_if(found_tracks, &nfound_tracks, cand_tracks, ncand_tracks, is_track_good(7, 12, 10, g->d_tubes)); __syncthreads(); } // eval_cand_tracks