/** @file bunch-ctx-dyn.cu.h kernels and methods for track reconstruction with
    dynamic parallelism */

/** call a specific member function of bunch for all bunches, each on its own
    stream; if the passed function is 0, just wait for all bunches */
static void for_all_bunches
(BunchCtx *bunches, cudaStream_t *streams, int nbunches,
 void (BunchCtx::*f)(cudaStream_t), bool sync = true) {
  for(int ibunch = 0; ibunch < nbunches; ibunch++) {
    if(sync)
      cucheck(cudaStreamSynchronize(streams[ibunch]));
    if(f)
      (bunches[ibunch].*f)(streams[ibunch]);
  }
}  // for_all_bunches

/** call a specific member function of bunch for all bunches, but on a single
    stream */
static void for_all_bunches
(BunchCtx *bunches, cudaStream_t s, int nbunches,
 void (BunchCtx::*f)(cudaStream_t), bool sync = true) {
  if(sync)
    cucheck(cudaStreamSynchronize(s));
  if(f) {
    for(int ibunch = 0; ibunch < nbunches; ibunch++)
      (bunches[ibunch].*f)(s);
  }
}  // for_all_bunches

/** reconstruction driver: one non-blocking host stream per bunch */
void reconstruct_hstream_fun(GlobalCtx *g) {
  nvtxRangePushA("with_streams");
  BunchCtx *bunches = g->bunches;
  int nbunches = g->nbunches;
  cudaStream_t *streams =
    (cudaStream_t *)malloc(nbunches * sizeof(cudaStream_t));
  for(int ibunch = 0; ibunch < nbunches; ibunch++) {
    cucheck(cudaStreamCreateWithFlags
            (&streams[ibunch], cudaStreamNonBlocking));
  }  // for(ibunch)
  nvtxRangePushA("reconstruct_hstreams");
  // do reconstruction for all bunches;
  // "false" means no sync _before_ the call (there is never a sync after
  // each call)
  reco_time_start();
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_tube_hits_hstream);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_half_skewlets_hstream);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_triplets_hstream, false);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_skewlets_pre_poca_hstream);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_skewlets_poca_hstream);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_cand_tracks_hstream, false);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::eval_cand_tracks_hstream);
  cucheck(cudaDeviceSynchronize());
  reco_time_end();
  nvtxRangePop();
  // destroy the streams
  for(int ibunch = 0; ibunch < nbunches; ibunch++)
    cucheck(cudaStreamDestroy(streams[ibunch]));
  free(streams);
  nvtxRangePop();
}  // reconstruct_hstream_fun

/** structure to be passed to thread-based reconstruction */
typedef struct {
  /** global context */
  GlobalCtx *g;
  /** bunches to be processed */
  BunchCtx *bunches;
  /** number of bunches to be processed */
  int nbunches;
} thread_ctx_t;
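/* reconstruct_tfun below returns its elapsed time through pthread's void *
   result by punning a double, which is valid only where
   sizeof(double) <= sizeof(void *) (i.e. typical 64-bit systems). A more
   defensive alternative is memcpy-based packing; the two helpers below are
   only a sketch of that option (pack_time/unpack_time are hypothetical names,
   not part of this codebase, and the same size assumption still applies): */
#include <string.h>

static inline void *pack_time(double t) {
  void *p = 0;
  memcpy(&p, &t, sizeof(t));  // copy the bit pattern instead of punning
  return p;
}

static inline double unpack_time(void *p) {
  double t = 0;
  memcpy(&t, &p, sizeof(t));
  return t;
}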
/** thread function for host-stream reconstruction */
void *reconstruct_tfun(void *arg) {
  thread_ctx_t *ctx = (thread_ctx_t *)arg;
  GlobalCtx *g = ctx->g;
  BunchCtx *bunches = ctx->bunches;
  int nbunches = ctx->nbunches;
  // device and stream
  cudaStream_t s;
  cucheck(cudaSetDevice(g->opts.idevice));
  cucheck(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));
  // do reconstruction with timing
  double t1 = omp_get_wtime();
  // feed many bunches into a single stream
#if 0
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_tube_hits_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_half_skewlets_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_triplets_hstream);
  for_all_bunches(bunches, s, nbunches,
                  &BunchCtx::find_skewlets_pre_poca_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_skewlets_poca_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_cand_tracks_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::eval_cand_tracks_hstream);
  for_all_bunches(bunches, s, nbunches, 0);
#else
  // process bunches sequentially
  for(int ibunch = 0; ibunch < nbunches; ibunch++) {
    bunches[ibunch].find_tube_hits_hstream(s);  // -> pivot_hits + skewed_hits
    cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_half_skewlets_hstream(s);  // nskewed_hits -> nhalf_skewlets
    //cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_triplets_hstream(s);  // npivot_hits -> ntriplets
    cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_skewlets_pre_poca_hstream(s);  // h_nhalf_skewlets -> npre_poca_pairs
    cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_skewlets_poca_hstream(s);  // h_npre_poca_pairs -> nskewlets
    //cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_cand_tracks_hstream(s);  // h_ntriplets -> ncand_tracks
    cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].eval_cand_tracks_hstream(s);
  }  // for(each bunch)
#endif
  cucheck(cudaStreamSynchronize(s));
  double t2 = omp_get_wtime();
  cucheck(cudaStreamDestroy(s));
  // return the elapsed time through the void * result; this pointer punning
  // works on 64-bit systems only
  double t = t2 - t1;
  return *(void **)&t;
}  // reconstruct_tfun

/** reconstruction with host threads, one stream per thread */
void reconstruct_hthread_fun(GlobalCtx *g) {
  // TODO: move #threads into an option
  int nthreads = 32;
  char *nthreads_str;
#if WITH_DYN_PAR == 1
  nthreads_str = getenv("CUDA_DEVICE_MAX_CONNECTIONS");
#else
  nthreads_str = getenv("OMP_NUM_THREADS");
#endif
  if(nthreads_str)
    nthreads = atoi(nthreads_str);
  printf("using %d device connections and host threads\n", nthreads);
  int nbunches_per_thread = divup(g->nbunches, nthreads);
  // memory for everything
  pthread_t *threads = (pthread_t *)malloc(nthreads * sizeof(pthread_t));
  thread_ctx_t *ctxs = (thread_ctx_t *)malloc(nthreads * sizeof(thread_ctx_t));
  // launch all threads
  for(int ithread = 0; ithread < nthreads; ithread++) {
    ctxs[ithread].g = g;
    int start = ithread * nbunches_per_thread;
    int end = min(g->nbunches, start + nbunches_per_thread);
    ctxs[ithread].nbunches = end - start;
    ctxs[ithread].bunches = g->bunches + start;
    pthread_create(&threads[ithread], 0, &reconstruct_tfun, &ctxs[ithread]);
  }
  // wait for all threads, collecting per-thread time statistics
  double t = 0, mint = 1000.0, avgt = 0.0;
  for(int ithread = 0; ithread < nthreads; ithread++) {
    double tt = 0;
    // the same 64-bit-only punning as in reconstruct_tfun
    pthread_join(threads[ithread], (void **)&tt);
    t = max(t, tt);
    mint = min(tt, mint);
    avgt += tt;
  }  // for(ithread)
  // free memory
  free(threads);
  free(ctxs);
  // print resulting time
  printf("total reconstruction time (avg per thread): %.3lf ms\n",
         avgt * 1e3 / nthreads);
  printf("reconstruction time stats: max=%.3lf ms, min=%.3lf ms, avg=%.3lf ms\n",
         t * 1e3, mint * 1e3, avgt * 1e3 / nthreads);
}  // reconstruct_hthread_fun

/** copy the bunch context back from the device, to pick up the counters
    filled in on the device */
void BunchCtx::sync_back(cudaStream_t s) {
  cucheck(cudaMemcpyAsync(this, d_this, sizeof(*this),
                          cudaMemcpyDeviceToHost, s));
}  // sync_back

void BunchCtx::find_tube_hits_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // find pivot tube hits
  host_copy_if(pivot_hits, &d_this->npivot_hits, hits, nhits,
               hit_is_pivot(g->d_tubes), s);
  // find skewed tube hits
  host_copy_if(skewed_hits, &d_this->nskewed_hits, hits, nhits,
               hit_is_skewed(g->d_tubes), s);
  sync_back(s);
}  // find_tube_hits
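/* host_copy_if above is this codebase's stream-ordered predicated compaction:
   it compacts the hits on stream s and writes the output count straight into
   device memory, so a later sync_back() can fetch it. The sketch below shows
   one way such a helper can be built on CUB's DeviceSelect::If; the actual
   helper may differ, and host_copy_if_sketch, its temp-storage handling, and
   the use of cudaMallocAsync (CUDA 11.2+) are assumptions for illustration: */
#include <cub/device/device_select.cuh>

template<typename T, typename Pred>
static void host_copy_if_sketch(T *d_out, int *d_nout, const T *d_in, int n,
                                Pred pred, cudaStream_t s) {
  void *d_temp = 0;
  size_t temp_bytes = 0;
  // first call only computes the required temporary storage size
  cub::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_nout, n, pred, s);
  cucheck(cudaMallocAsync(&d_temp, temp_bytes, s));
  // second call performs the compaction; d_nout is written on the device
  cub::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_nout, n, pred, s);
  cucheck(cudaFreeAsync(d_temp, s));
}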
void BunchCtx::find_triplets_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // find triplets
  size_t ntriplets_sz = g->npivot_ranges * sizeof(int);
  cucheck(cudaMemsetAsync(ntriplets, 0, ntriplets_sz, s));
  if(npivot_hits > 0) {
    int bs = 256;
    // launch configuration reconstructed (the <<<...>>> arguments were lost
    // in the source); one thread per pivot hit is assumed, following the
    // divup(n, bs) idiom used elsewhere in this file
    find_triplets_k<<<divup(npivot_hits, bs), bs, 0, s>>>(d_this);
    cucheck(cudaGetLastError());
  }
  cucheck(cudaMemcpyAsync(h_ntriplets, ntriplets, ntriplets_sz,
                          cudaMemcpyDeviceToHost, s));
}  // find_triplets

void BunchCtx::find_half_skewlets_hstream(cudaStream_t s) {
  // find half-skewlets
  const GlobalCtx *g = h_global_ctx;
  size_t nhalf_skewlets_sz = g->nskewed_ranges * sizeof(int);
  int bs = 256;
  if(nskewed_hits > 0) {
    // reconstructed launch: one thread per skewed hit (assumed)
    find_half_skewlets_k<<<divup(nskewed_hits, bs), bs, 0, s>>>(d_this);
    cucheck(cudaGetLastError());
  }
  cucheck(cudaMemcpyAsync(h_nhalf_skewlets, nhalf_skewlets, nhalf_skewlets_sz,
                          cudaMemcpyDeviceToHost, s));
}  // find_half_skewlets

void BunchCtx::find_skewlets_pre_poca_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // main skewlet test (everything except the poca test)
  for(int irange = 0; irange < g->nskewlet_ranges; irange++) {
    int ninner = h_nhalf_skewlets[irange];
    int nouter = h_nhalf_skewlets[irange + 1];
    if(ninner == 0 || nouter == 0)
      continue;
    // main test
    dim3 bs(SKEWLETS_BSX, SKEWLETS_BSY);
    dim3 grid(divup(ninner, bs.x), divup(nouter, bs.y));
    combine_half_skewlets_k<<<grid, bs, 0, s>>>(d_this, irange);
    cucheck(cudaGetLastError());
  }  // loop over ranges
  size_t npre_poca_pairs_sz = g->nskewlet_ranges * sizeof(int);
  cucheck(cudaMemcpyAsync
          (h_npre_poca_pairs, npre_poca_pairs, npre_poca_pairs_sz,
           cudaMemcpyDeviceToHost, s));
}  // find_skewlets_pre_poca_hstream

void BunchCtx::find_skewlets_poca_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // poca test
  for(int irange = 0; irange < g->nskewlet_ranges; irange++) {
    int npairs = h_npre_poca_pairs[irange];
    if(npairs == 0)
      continue;
    int bs = 128;
    // reconstructed launch: one thread per pre-poca pair (assumed)
    poca_test_k<<<divup(npairs, bs), bs, 0, s>>>(d_this, irange);
    cucheck(cudaGetLastError());
  }
  size_t nskewlets_sz = g->nskewlet_ranges * sizeof(int);
  cucheck(cudaMemcpyAsync(h_nskewlets, nskewlets, nskewlets_sz,
                          cudaMemcpyDeviceToHost, s));
}  // find_skewlets_poca_hstream

void BunchCtx::find_cand_tracks_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // go through all pairs of pivot ranges
  for(int irange = 0; irange < g->npivot_ranges; irange++)
    for(int jrange = irange + 1; jrange < g->npivot_ranges; jrange++) {
      int ni = h_ntriplets[irange], nj = h_ntriplets[jrange];
      if(ni == 0 || nj == 0)
        continue;
      int bsi = 64, bsj = 4;
      dim3 bs(bsi, bsj), grid(divup(ni, bsi), divup(nj, bsj));
      combine_triplets_k<<<grid, bs, 0, s>>>
        (d_this, irange, jrange, h_max_phis_g[irange][jrange]);
      cucheck(cudaGetLastError());
    }
  sync_back(s);
}  // find_cand_tracks_hstream

void BunchCtx::compute_track_circles(cudaStream_t s) {
  int bs = 256;
  if(ncand_tracks > 0) {
    // reconstructed launch: one thread per candidate track (assumed)
    compute_circles_k<<<divup(ncand_tracks, bs), bs, 0, s>>>
      (cand_tracks, ncand_tracks);
    cucheck(cudaGetLastError());
  }
}  // compute_track_circles

void BunchCtx::eval_cand_tracks_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  int bs = 0;
  if(ncand_tracks > 0) {
    bs = 256;
    compute_circles_k<<<divup(ncand_tracks, bs), bs, 0, s>>>
      (cand_tracks, ncand_tracks);
    cucheck(cudaGetLastError());
    bs = EVAL_TRACKS_BS;
    // reconstructed launches: one thread per candidate track is assumed for
    // the evaluation kernels as well
    switch(g->opts.tube_strategy()) {
    case TubeTestRaster:
      eval_tracks_tubes_raster_k<<<divup(ncand_tracks, bs), bs, 0, s>>>(d_this);
      break;
    case TubeTestShared:
      eval_tracks_tubes_shared_k<<<divup(ncand_tracks, bs), bs, 0, s>>>(d_this);
      break;
    default:
      assert(0);
    }
    cucheck(cudaGetLastError());
    eval_tracks_skewlets_k<<<divup(ncand_tracks, bs), bs, 0, s>>>(d_this);
    cucheck(cudaGetLastError());
  }
  // group all good tracks
  host_copy_if(found_tracks, &d_this->nfound_tracks, cand_tracks, ncand_tracks,
               is_track_good(7, 12, 10, g->d_tubes), s);
  sync_back(s);
}  // eval_cand_tracks
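/* For reference, WITH_DYN_PAR == 1 selects this codebase's
   dynamic-parallelism build, where a device-side driver launches the stage
   kernels from the GPU instead of a host stream. The kernel below is only a
   sketch of that pattern (reconstruct_bunch_dyn_k_sketch is a hypothetical
   name, not this codebase's actual device-side driver; it also assumes divup
   is callable from device code, and device-side cudaDeviceSynchronize(),
   which it relies on, was removed in CUDA 12): */
__global__ void reconstruct_bunch_dyn_k_sketch(BunchCtx *bunch) {
  // a single parent thread drives one bunch through a stage and its sync
  if(blockIdx.x != 0 || threadIdx.x != 0)
    return;
  int bs = 256;
  if(bunch->npivot_hits > 0) {
    // child grid launched from the device; no host round-trip for the count
    find_triplets_k<<<divup(bunch->npivot_hits, bs), bs>>>(bunch);
  }
  // wait for the child grid before any dependent stage reads ntriplets
  cudaDeviceSynchronize();
}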