/** @file bunch-ctx-dyn.cu.h kernels and methods for track reconstruction with dynamic parallelism */

// kernels

// global reconstruction kernel; a possible host-side launch is sketched at the
// end of this file
__global__ void reconstruct_dyn_k(void) {
	const GlobalCtx *g = &global_ctx_g;
	int nbunches = g->nbunches;
	int ithread = threadIdx.x + blockIdx.x * blockDim.x;
	//if(ithread % 2)
	//	return;
	//int ibunch = ithread / 2;
	int ibunch = ithread;
	if(ibunch >= nbunches)
		return;
	BunchCtx *bunch = &g->d_bunches[ibunch];
	bunch->reconstruct_dyn();
} // reconstruct_dyn_k

// (non-kernel) methods for reconstruction with dynamic parallelism

__device__ void BunchCtx::reconstruct_dyn(void) {
	// create stream
	cudaStream_t s;
	//s = 0;
	cucheck_dev(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));
	// do reconstruction
	find_tube_hits_dyn(s);
	find_half_skewlets_dyn(s);
	find_triplets_dyn(s);
	find_cand_tracks_dyn(s);
	find_skewlets_dyn(s);
	eval_cand_tracks_dyn(s);
	// device runtime ensures that all previously launched kernels finish before
	// the host-called kernel finishes
	cucheck_dev(cudaStreamDestroy(s));
} // reconstruct_dyn

__device__ void BunchCtx::find_tube_hits_dyn(cudaStream_t s) {
	const GlobalCtx *g = &global_ctx_g;
	// find pivot tube hits
	dyn_copy_if(pivot_hits, &npivot_hits, hits, nhits, hit_is_pivot(g->d_tubes), s);
	// find skewed tube hits
	dyn_copy_if(skewed_hits, &nskewed_hits, hits, nhits, hit_is_skewed(g->d_tubes), s);
	//#if !BUNCH_HOST
	if(g->opts.group_on_device()) {
		int bs = 256;
		// get tube-hit count histogram
		tube_hit_histo_k<<<divup(nhits, bs), bs, 0, s>>>(this);
		cucheck_dev(cudaGetLastError());
		cucheck_dev(cudaDeviceSynchronize());
		// remove tubes with zero counts
		nz_tubes_k<<<divup(g->ntubes, bs), bs, 0, s>>>(this);
		// obtain positions from counts using prefix sums
		// (currently sequential, bottleneck)
		// TODO: replace with parallel
		//dyn_xpre_sum(tube_hit_starts, tube_hit_counts, g->ntubes + 1, 0, s);
		dyn_xpre_sum(tube_hit_starts, tube_hit_counts, g->ntubes, tube_hit_scratch, s);
		// group hits by tubes
		// (an illustrative sketch of this count/scan/scatter grouping is at the
		// end of this file)
		cucheck_dev(cudaMemsetAsync
								(tube_hit_counts, 0, (g->ntubes + 1) * sizeof(int), s));
		//cucheck_dev(cudaDeviceSynchronize());
		tube_hit_group_k<<<divup(nhits, bs), bs, 0, s>>>(this);
	}
	//#endif
	cucheck_dev(cudaGetLastError());
	cucheck_dev(cudaDeviceSynchronize());
} // find_tube_hits

__device__ void BunchCtx::find_triplets_dyn(cudaStream_t s) {
	const GlobalCtx *g = &global_ctx_g;
	// find triplets
	memset(ntriplets, 0, g->npivot_ranges * sizeof(int));
	if(npivot_hits > 0) {
		int bs = 256;
		find_triplets_k<<<divup(npivot_hits, bs), bs, 0, s>>>(this);
		cucheck_dev(cudaGetLastError());
		cucheck_dev(cudaDeviceSynchronize());
	}
} // find_triplets

__device__ void BunchCtx::find_half_skewlets_dyn(cudaStream_t s) {
	// find half-skewlets
	int bs = 256;
	if(nskewed_hits > 0) {
		find_half_skewlets_k<<<divup(nskewed_hits, bs), bs, 0, s>>>(this);
		cucheck_dev(cudaGetLastError());
		cucheck_dev(cudaDeviceSynchronize());
	}
} // find_half_skewlets

__device__ void BunchCtx::find_skewlets_dyn(cudaStream_t s) {
	const GlobalCtx *g = &global_ctx_g;
	// main skewlet test (all except poca)
	for(int irange = 0; irange < g->nskewlet_ranges; irange++) {
		int ninner = nhalf_skewlets[irange], nouter = nhalf_skewlets[irange + 1];
		if(ninner == 0 || nouter == 0)
			continue;
		// main test
		dim3 bs(SKEWLETS_BSX, SKEWLETS_BSY);
		dim3 grid(divup(ninner, bs.x), divup(nouter, bs.y));
		combine_half_skewlets_k<<<grid, bs, 0, s>>>(this, irange);
		cucheck_dev(cudaGetLastError());
	} // loop over ranges
	cucheck_dev(cudaDeviceSynchronize());
	// poca test
	for(int irange = 0; irange < g->nskewlet_ranges; irange++) {
		int npairs = npre_poca_pairs[irange];
		if(npairs == 0)
			continue;
		int bs = 128;
		poca_test_k<<<divup(npairs, bs), bs, 0, s>>>(this, irange);
		cucheck_dev(cudaGetLastError());
	}
	cucheck_dev(cudaDeviceSynchronize());
	// just print the total #skewlets
	//int total_nskewlets = nskewlets[0] + nskewlets[1] + nskewlets[2];
	//printf("bunch %d: total %d skewlets\n", bunch_id, total_nskewlets);
} // find_skewlets

__device__ void BunchCtx::find_cand_tracks_dyn(cudaStream_t s) {
	const GlobalCtx *g = &global_ctx_g;
	// go through all tracks
	for(int irange = 0; irange < g->npivot_ranges; irange++)
		for(int jrange = irange + 1; jrange < g->npivot_ranges; jrange++) {
			int ni = ntriplets[irange], nj = ntriplets[jrange];
			if(ni == 0 || nj == 0)
				continue;
			int bsi = 64, bsj = 4;
			dim3 bs(bsi, bsj), grid(divup(ni, bsi), divup(nj, bsj));
			// combine_triplets_k<<<grid, bs, 0, s>>>
			// 	(this, irange, jrange, max_phis_g[irange][jrange]);
			// combine_triplets_k<<<grid, bs, 0, s>>>
			// 	(this, irange, jrange, cosf(max_phis_g[irange][jrange]));
			combine_triplets_k<<<grid, bs, 0, s>>>
				(this, irange, jrange, min_cos_phis_g[irange][jrange]);
			cucheck_dev(cudaGetLastError());
		}
	cucheck_dev(cudaDeviceSynchronize());
	// compute circles for track candidates
	if(ncand_tracks > 0) {
		int bs = 256;
		compute_circles_k<<<divup(ncand_tracks, bs), bs, 0, s>>>
			(cand_tracks, ncand_tracks);
		cucheck_dev(cudaGetLastError());
		cucheck_dev(cudaDeviceSynchronize());
	}
} // find_cand_tracks

__device__ void BunchCtx::eval_cand_tracks_dyn(cudaStream_t s) {
	const GlobalCtx *g = &global_ctx_g;
	int bs = EVAL_TRACKS_BS;
	if(ncand_tracks > 0) {
		switch(g->opts.tube_strategy()) {
		case TubeTestRaster:
			eval_tracks_tubes_raster_k<<<divup(ncand_tracks, bs), bs, 0, s>>>(this);
			break;
		case TubeTestShared:
			eval_tracks_tubes_shared_k<<<divup(ncand_tracks, bs), bs, 0, s>>>(this);
			break;
		default:
			assert(0);
		}
		cucheck_dev(cudaGetLastError());
		// skewlets
		switch(g->opts.skewlet_strategy()) {
		case SkewletTestAll:
			eval_tracks_skewlets_all_k<<<divup(ncand_tracks, bs), bs, 0, s>>>(this);
			break;
		case SkewletTestBins:
			eval_tracks_skewlets_bins_k<<<divup(ncand_tracks, bs), bs, 0, s>>>(this);
			break;
		default:
			assert(0);
		}
		cucheck_dev(cudaGetLastError());
	}
	//printf("bunch %d: %d cand tracks\n", bunch_id, ncand_tracks);
	// group all good tracks
	dyn_copy_if(found_tracks, &nfound_tracks, cand_tracks, ncand_tracks,
							is_track_good(7, 12, 10, g->d_tubes), s);
	//cucheck_dev(cudaDeviceSynchronize());
} // eval_cand_tracks
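
// Illustrative sketch, not part of the original file: a possible host-side launch
// of reconstruct_dyn_k with one thread per bunch, matching the ibunch = ithread
// indexing above. The block size (64), the helper name, and the host-side cucheck()
// macro (assumed to be the host counterpart of cucheck_dev()) are assumptions; the
// block is kept out of the build with #if 0.
#if 0
static void launch_reconstruct_dyn_sketch(int nbunches) {
	int bs = 64;
	// each thread handles one bunch and runs its reconstruction pipeline,
	// launching the per-step child kernels through dynamic parallelism
	reconstruct_dyn_k<<<divup(nbunches, bs), bs>>>();
	cucheck(cudaGetLastError());
	// the device runtime guarantees that all child kernels launched inside
	// BunchCtx::reconstruct_dyn() have finished when this kernel finishes
	cucheck(cudaDeviceSynchronize());
}
#endif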
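
// Another illustrative sketch, not part of the original file: the count/scan/scatter
// pattern that find_tube_hits_dyn() appears to use to group hits by tube.
// tube_hit_histo_k counts hits per tube, dyn_xpre_sum() turns the counts into segment
// start offsets, the counts are zeroed, and tube_hit_group_k scatters each hit into
// its tube's segment. The kernel below is a simplified stand-in with hypothetical
// names (group_hits_sketch_k, hit_tube), not the project's actual kernels, and is
// kept out of the build with #if 0.
#if 0
__global__ void group_hits_sketch_k
(int *grouped, int *starts, int *counts, const int *hit_tube, int nhits) {
	int i = threadIdx.x + blockIdx.x * blockDim.x;
	if(i >= nhits)
		return;
	int tube = hit_tube[i];
	// starts[tube] is the segment base from the exclusive prefix sum of per-tube
	// counts; atomicAdd() hands out unique slots within the segment
	int pos = starts[tube] + atomicAdd(&counts[tube], 1);
	grouped[pos] = i;
}
#endif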