/** @file bunch-ctx-host.cu host-only methods of the BunchCtx class, moved to a
    separate file for faster compilation */

// standard and CUDA headers; the original header names were lost, so this set
// is inferred from what the file actually uses
#include <assert.h>
#include <omp.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <nvToolsExt.h>

using namespace std;

#include "bunch-ctx.h"
#include "global-ctx.h"
#include "data.h"
#include "dynthrust.h"
#include "options.h"
#include "sector-layer.h"
#include "triplet-finder.h"
#include "util.h"
#include "vec.h"
//#include "data.cu.h"
//#include "sector-layer.cu.h"
//#include "track.cu.h"
//#include "triplet.cu.h"
#include "bunch-ctx-internal.h"

/** host-side angle constants: maximum angles */
float h_max_phis_g[3][3] = MAX_PHIS;
/** host-side angle constants: minimum cosines of the angles */
float h_min_cos_phis_g[3][3] = MIN_COS_PHIS;
// the same constants in GPU constant memory, declared in bunch-ctx.cu
extern __constant__ float min_cos_phis_g[3][3];

// host-only functions
void BunchCtx::initialize(void) {
  void *addr;
  cucheck(cudaGetSymbolAddress(&addr, min_cos_phis_g));
  cucheck(cudaMemcpy(addr, &h_min_cos_phis_g[0][0], sizeof(h_min_cos_phis_g),
                     cudaMemcpyHostToDevice));
  // get attributes of some of the bunch kernels
  cudaFuncAttributes attrs;
  cucheck(cudaFuncGetAttributes(&attrs, eval_tracks_skewlets_all_k));
  printf("eval_tracks_skewlets_all_k: %zu B shared\n", attrs.sharedSizeBytes);
}  // initialize

BunchCtx::BunchCtx
(GlobalCtx *global_ctx, BunchCtx *d_this, int bunch_id, float tstart,
 float dtcore) {
  memset(this, 0, sizeof(*this));
  this->h_global_ctx = global_ctx;
  this->d_global_ctx = global_ctx->d_this;
  this->d_this = d_this;
  this->bunch_id = bunch_id;
  this->tstart = tstart;
  tend_core = tstart + dtcore;
  tend = tend_core + DRIFT_TIME_PEPS;
}  // BunchCtx

void BunchCtx::alloc_memory(void) {
  cucheck(cudaMalloc((void **)&hits, h_global_ctx->max_nhits * sizeof(Hit)));
  cucheck(cudaMallocHost((void **)&h_hits,
                         h_global_ctx->max_nhits * sizeof(Hit)));
  // a shortcut
  GlobalCtx *g = h_global_ctx;

  // memory for grouping hits by tubes
  size_t itubes_sz = (g->ntubes + 1) * sizeof(int);
  size_t scratch_sz = SCAN_BS * sizeof(int);
  // cucheck(cudaMalloc((void **)&pivot_hits,
  //                    g->max_ntriplets * sizeof(Hit)));
  // cucheck(cudaMalloc((void **)&skewed_hits,
  //                    g->max_nskewlets * sizeof(Hit)));
  cucheck(cuda_alloc(&pivot_hits, g->max_ntriplets));
  cucheck(cuda_alloc(&skewed_hits, g->max_nskewlets));
  //cucheck(cudaMalloc((void **)&tube_hits, g->max_nhits * sizeof(Hit)));
  cucheck(cuda_alloc(&tube_hits, g->max_nhits));
  cucheck(cudaMalloc((void **)&tube_hit_starts, itubes_sz));
  cucheck(cudaMalloc((void **)&tube_hit_counts, itubes_sz));
  //cucheck(cudaMallocHost((void **)&h_tube_hits, g->max_nhits * sizeof(Hit)));
  cucheck(host_alloc(&h_tube_hits, g->max_nhits));
  cucheck(cudaMalloc((void **)&tube_hit_scratch, scratch_sz));
  cucheck(cudaMallocHost((void **)&h_tube_hit_starts, itubes_sz));
  cucheck(cudaMallocHost((void **)&h_tube_hit_counts, itubes_sz));
  cucheck(cudaMalloc((void **)&itubes, itubes_sz));
  cucheck(cudaMallocHost((void **)&h_itubes, itubes_sz));
  cucheck(cudaMemset(tube_hit_counts, 0, itubes_sz));
  cucheck(cudaMemset(tube_hit_starts, 0, itubes_sz));
  cucheck(cudaMemset(itubes, 0, itubes_sz));
  cucheck(cudaMemset(tube_hit_scratch, 0, scratch_sz));
  memset(h_itubes, 0, itubes_sz);

  // memory for triplets
  size_t triplet_sz = g->max_ntriplets * g->npivot_ranges * sizeof(Triplet);
  size_t ntriplets_sz = g->npivot_ranges * sizeof(int);
  //cucheck(cudaMalloc((void **)&triplets, triplet_sz));
  cucheck(cuda_alloc(&triplets, g->max_ntriplets * g->npivot_ranges));
  cucheck(cudaMalloc((void **)&ntriplets, ntriplets_sz));
  cucheck(cudaMallocHost((void **)&h_ntriplets, ntriplets_sz));
  memset(h_ntriplets, 0, ntriplets_sz);

  // memory for candidate tracks; zero it out
  // cucheck(cudaMalloc((void **)&cand_tracks,
  //                    g->max_ntracks * sizeof(Track)));
  cucheck(cuda_alloc(&cand_tracks, g->max_ntracks));
  //cucheck(cudaMemset(cand_tracks, 0, g->max_ntracks * sizeof(Track)));

  // memory for half-skewlets
  size_t half_skewlet_sz =
    g->nskewed_ranges * g->max_nskewlets * sizeof(HalfSkewlet);
  size_t counter_sz = g->nskewed_ranges * sizeof(int);
  //cucheck(cudaMalloc((void **)&half_skewlets, half_skewlet_sz));
  cucheck(cuda_alloc(&half_skewlets, g->nskewed_ranges * g->max_nskewlets));
  cucheck(cudaMalloc((void **)&nhalf_skewlets, counter_sz));
  cucheck(cudaMallocHost((void **)&h_nhalf_skewlets, counter_sz));
  cucheck(cudaMemset(nhalf_skewlets, 0, counter_sz));
  memset(h_nhalf_skewlets, 0, counter_sz);

  // memory for skewlets
  size_t skewlet_sz = g->nskewlet_ranges * g->max_nskewlets * sizeof(Skewlet);
  size_t skewlet_comp_sz =
    g->nskewlet_ranges * g->max_nskewlets * sizeof(float);
  size_t pre_poca_sz = g->nskewlet_ranges * g->max_nskewlets * 2 * sizeof(int);
  counter_sz = g->nskewlet_ranges * sizeof(int);
  //cucheck(cudaMalloc((void **)&skewlets, skewlet_sz));
  cucheck(cuda_alloc(&skewlets, g->nskewlet_ranges * g->max_nskewlets));
  cucheck(cudaMalloc((void **)&poca_xs, skewlet_comp_sz));
  cucheck(cudaMalloc((void **)&poca_ys, skewlet_comp_sz));
  cucheck(cudaMalloc((void **)&min_t0s, skewlet_comp_sz));
  cucheck(cudaMalloc((void **)&nskewlets, counter_sz));
  cucheck(cudaMallocHost((void **)&h_nskewlets, counter_sz));
  cucheck(cudaMalloc((void **)&pre_poca_pairs, pre_poca_sz));
  cucheck(cudaMalloc((void **)&npre_poca_pairs, counter_sz));
  cucheck(cudaMallocHost((void **)&h_npre_poca_pairs, counter_sz));
  cucheck(cudaMemset(nskewlets, 0, counter_sz));
  memset(h_nskewlets, 0, counter_sz);
  cucheck(cudaMemset(npre_poca_pairs, 0, counter_sz));
  memset(h_npre_poca_pairs, 0, counter_sz);

  // memory for skewlet bins
  size_t skewlet_bin_sz =
    g->nskewlet_ranges * g->nsectors * NSL_SKEWLET_BINS * sizeof(int);
  cucheck(cudaMalloc((void **)&skewlet_bins, skewlet_bin_sz));
  cucheck(cudaMemset(skewlet_bins, -1, skewlet_bin_sz));

  // memory for tracks
  cucheck(cudaMalloc((void **)&found_tracks, g->max_ntracks * sizeof(Track)));
  cucheck(cudaMallocHost((void **)&h_found_tracks,
                         g->max_ntracks * sizeof(Track)));
}  // alloc_memory
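/* Note on cuda_alloc()/host_alloc()/cuda_copy() used above and below: they are
   declared elsewhere (presumably in util.h or dynthrust.h). Judging by the
   commented-out cudaMalloc/cudaMallocHost calls they replaced, they are thin
   typed wrappers that compute the byte size from the element type, roughly
   along these lines (a sketch, not the actual implementation):

   template<class T> cudaError_t cuda_alloc(T **p, size_t n) {
     return cudaMalloc((void **)p, n * sizeof(T));
   }
   template<class T> cudaError_t host_alloc(T **p, size_t n) {
     return cudaMallocHost((void **)p, n * sizeof(T));
   }
*/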
void BunchCtx::bunch_hits_host(void) {
  // find lower and upper bounds for hits in this bunch's time window
  GlobalCtx *g = h_global_ctx;
  Hit h0(tstart, 0), h1(tend, 0);
  Hit *pstart = lower_bound(g->hits, g->hits + g->nhits, h0, hit_is_less());
  Hit *pend = upper_bound(g->hits, g->hits + g->nhits, h1, hit_is_less());
  copy(pstart, pend, h_hits);
  nhits = pend - pstart;
}  // bunch_hits_host

void BunchCtx::find_tube_hits_host(void) {
  GlobalCtx *g = h_global_ctx;
  // count hits in each tube
  memset(h_tube_hit_counts, 0, (g->ntubes + 1) * sizeof(int));
  for(int ihit = 0; ihit < nhits; ihit++)
    h_tube_hit_counts[h_hits[ihit].tube_id]++;
  // compute starts
  memset(h_tube_hit_starts, 0, (g->ntubes + 1) * sizeof(int));
  for(int itube = 1; itube < g->ntubes + 1; itube++) {
    h_tube_hit_starts[itube] = h_tube_hit_starts[itube - 1] +
      h_tube_hit_counts[itube - 1];
  }
  // group hits
  memset(h_tube_hit_counts, 0, (g->ntubes + 1) * sizeof(int));
  for(int ihit = 0; ihit < nhits; ihit++) {
    int pos = h_tube_hit_counts[h_hits[ihit].tube_id]++;
    pos += h_tube_hit_starts[h_hits[ihit].tube_id];
    h_tube_hits[pos] = h_hits[ihit];
  }
  // find non-empty tubes
  nitubes = 0;
  for(int itube = 1; itube < g->ntubes; itube++)
    if(h_tube_hit_starts[itube] < h_tube_hit_starts[itube + 1])
      h_itubes[nitubes++] = itube;
}  // find_tube_hits_host
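/* find_tube_hits_host() above is a three-pass counting sort keyed on tube_id:
   count hits per tube, exclusive-scan the counts into start offsets, then
   scatter hits into their slots. bunch_hits_host() relies on g->hits being
   sorted by time, with hit_is_less (defined elsewhere) as the ordering;
   assuming the time field is the one set by the Hit(tstart, 0) constructor,
   a minimal sketch of such a comparator would be:

   struct hit_is_less {
     bool operator()(const Hit &a, const Hit &b) const { return a.t < b.t; }
   };
*/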
void BunchCtx::sync_arrays_to_device(void) {
  GlobalCtx *g = h_global_ctx;
  size_t itubes_sz = (g->ntubes + 1) * sizeof(int);
  cucheck(cudaMemcpy(hits, h_hits, nhits * sizeof(Hit),
                     cudaMemcpyHostToDevice));
  if(!g->opts.group_on_device()) {
    //cucheck(cudaMemcpy(tube_hits, h_tube_hits, nhits * sizeof(Hit),
    //                   cudaMemcpyHostToDevice));
    cucheck(cuda_copy(tube_hits, h_tube_hits, nhits, cudaMemcpyHostToDevice));
    cucheck(cudaMemcpy(tube_hit_starts, h_tube_hit_starts, itubes_sz,
                       cudaMemcpyHostToDevice));
    cucheck(cudaMemcpy(tube_hit_counts, h_tube_hit_counts, itubes_sz,
                       cudaMemcpyHostToDevice));
  }
  cucheck(cudaMemcpy(itubes, h_itubes, itubes_sz, cudaMemcpyHostToDevice));
}  // sync_arrays_to_device

void BunchCtx::free_memory(void) {
  // TODO: free allocated memory
}  // free_memory

void BunchCtx::collect_tracks(vector<Track> &tracks) {
  // copy found tracks back from the GPU
  cucheck(cudaMemcpy
          (h_found_tracks, found_tracks, nfound_tracks * sizeof(Track),
           cudaMemcpyDeviceToHost));
  for(int itrack = 0; itrack < nfound_tracks; itrack++)
    tracks.push_back(h_found_tracks[itrack]);
}  // collect_tracks

/** calls a specific member function of the bunch for all bunches; if the
    passed function is 0, just waits for all bunches */
static void for_all_bunches
(BunchCtx *bunches, cudaStream_t *streams, int nbunches,
 void (BunchCtx::*f)(cudaStream_t), bool sync = true) {
  for(int ibunch = 0; ibunch < nbunches; ibunch++) {
    if(sync)
      cucheck(cudaStreamSynchronize(streams[ibunch]));
    if(f)
      (bunches[ibunch].*f)(streams[ibunch]);
  }
}  // for_all_bunches

#if 0
/** calls a specific member function of the bunch for all bunches, but on a
    single stream */
static void for_all_bunches
(BunchCtx *bunches, cudaStream_t s, int nbunches,
 void (BunchCtx::*f)(cudaStream_t), bool sync = true) {
  if(sync)
    cucheck(cudaStreamSynchronize(s));
  if(f) {
    for(int ibunch = 0; ibunch < nbunches; ibunch++)
      (bunches[ibunch].*f)(s);
  }
}  // for_all_bunches
#endif

void reconstruct_hstream_fun(GlobalCtx *g) {
  nvtxRangePushA("with_streams");
  BunchCtx *bunches = g->bunches;
  int nbunches = g->nbunches;
  cudaStream_t *streams = (cudaStream_t *)
    malloc(nbunches * sizeof(cudaStream_t));
  for(int ibunch = 0; ibunch < nbunches; ibunch++) {
    cucheck(cudaStreamCreateWithFlags
            (&streams[ibunch], cudaStreamNonBlocking));
  }  // for(ibunch)

  nvtxRangePushA("reconstruct_hstreams");
  // do reconstruction for all bunches
  // "false" means absence of sync _before_ the call, not after (there's no
  // sync after each call)
  reco_time_start();
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_tube_hits_hstream);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_half_skewlets_hstream);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_triplets_hstream, false);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_skewlets_pre_poca_hstream);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_skewlets_poca_hstream);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_cand_tracks_hstream, false);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::eval_cand_tracks_hstream);
  cucheck(cudaDeviceSynchronize());
  reco_time_end();
  nvtxRangePop();

  // destroy the streams
  for(int ibunch = 0; ibunch < nbunches; ibunch++)
    cucheck(cudaStreamDestroy(streams[ibunch]));
  free(streams);
  nvtxRangePop();
}  // reconstruct_hstream_fun
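/* The sync flags in reconstruct_hstream_fun() above follow a simple rule: a
   stage must be preceded by a stream sync whenever it sizes its kernel grids
   from host-side counters filled in by an earlier stage's async copy-back
   (e.g. find_skewlets_pre_poca_hstream reads h_nhalf_skewlets, and
   eval_cand_tracks_hstream reads ncand_tracks). find_triplets_hstream and
   find_cand_tracks_hstream pass sync = false because the host counters they
   need (npivot_hits and h_ntriplets, respectively) were already made visible
   by the sync preceding an earlier stage. */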
/** structure to be passed to thread-based reconstruction */
typedef struct {
  /** global context */
  GlobalCtx *g;
  /** bunches to be processed */
  BunchCtx *bunches;
  /** number of bunches to be processed */
  int nbunches;
} thread_ctx_t;

/** thread function for host stream reconstruction */
void *reconstruct_tfun(void *arg) {
  thread_ctx_t *ctx = (thread_ctx_t *)arg;
  GlobalCtx *g = ctx->g;
  BunchCtx *bunches = ctx->bunches;
  int nbunches = ctx->nbunches;
  // device and stream
  cudaStream_t s;
  cucheck(cudaSetDevice(g->opts.idevice));
  cucheck(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));
  // do reconstruction with timing
  double t1 = omp_get_wtime();
  // feed many bunches into a single stream
#if 0
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_tube_hits_hstream);
  for_all_bunches(bunches, s, nbunches,
                  &BunchCtx::find_half_skewlets_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_triplets_hstream);
  for_all_bunches(bunches, s, nbunches,
                  &BunchCtx::find_skewlets_pre_poca_hstream);
  for_all_bunches(bunches, s, nbunches,
                  &BunchCtx::find_skewlets_poca_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_cand_tracks_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::eval_cand_tracks_hstream);
  for_all_bunches(bunches, s, nbunches, 0);
#else
  // process bunches sequentially
  for(int ibunch = 0; ibunch < nbunches; ibunch++) {
    bunches[ibunch].find_tube_hits_hstream(s);  // -> pivot_hits + skewed_hits
    cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_half_skewlets_hstream(s);  // nskewed_hits ->
                                                    // nhalf_skewlets
    //cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_triplets_hstream(s);  // npivot_hits -> ntriplets
    cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_skewlets_pre_poca_hstream(s);  // h_nhalf_skewlets ->
                                                        // npre_poca_pairs
    cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_skewlets_poca_hstream(s);  // h_npre_poca_pairs ->
                                                    // nskewlets
    //cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_cand_tracks_hstream(s);  // h_ntriplets
    cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].eval_cand_tracks_hstream(s);
  }  // for(each bunch)
#endif
  cucheck(cudaStreamSynchronize(s));
  double t2 = omp_get_wtime();
  cucheck(cudaStreamDestroy(s));
  // return the time; works on 64-bit systems only, where
  // sizeof(double) == sizeof(void *)
  double t = t2 - t1;
  return *(void **)&t;
}  // reconstruct_tfun
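/* The type-punned return above (and the matching pthread_join cast in
   reconstruct_hthread_fun below) smuggles the elapsed double through the
   void * thread result and, as the comment notes, assumes sizeof(double) ==
   sizeof(void *). A portable alternative would be to return the time through
   the thread context instead, e.g. (a sketch; thread_ctx_t has no such field
   in this file):

   // in thread_ctx_t: double elapsed;
   ctx->elapsed = t2 - t1;
   return 0;   // caller reads ctxs[ithread].elapsed after pthread_join
*/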
// reconstruction with host threads
void reconstruct_hthread_fun(GlobalCtx *g) {
  // TODO: move #threads into an option
  int nthreads = 32;
  char *cudaDeviceMaxConnectionsString;
  // with dynamic parallelism, the thread count follows the number of device
  // connections; otherwise it follows the OpenMP thread count
#if WITH_DYN_PAR == 1
  cudaDeviceMaxConnectionsString = getenv("CUDA_DEVICE_MAX_CONNECTIONS");
#else
  cudaDeviceMaxConnectionsString = getenv("OMP_NUM_THREADS");
#endif
  if(0 != cudaDeviceMaxConnectionsString)
    nthreads = atoi(cudaDeviceMaxConnectionsString);
  printf("Used: %d device connections and threads\n", nthreads);
  int nbunches_per_thread = divup(g->nbunches, nthreads);
  // memory for everything
  pthread_t *threads = (pthread_t *)malloc(nthreads * sizeof(pthread_t));
  thread_ctx_t *ctxs =
    (thread_ctx_t *)malloc(nthreads * sizeof(thread_ctx_t));
  // launch all threads
  for(int ithread = 0; ithread < nthreads; ithread++) {
    ctxs[ithread].g = g;
    int start = ithread * nbunches_per_thread;
    int end = min(g->nbunches, start + nbunches_per_thread);
    ctxs[ithread].nbunches = end - start;
    ctxs[ithread].bunches = g->bunches + start;
    pthread_create(&threads[ithread], 0, &reconstruct_tfun, &ctxs[ithread]);
  }
  // wait for all threads
  double t = 0, mint = 1000.0, avgt = 0.0;
  for(int ithread = 0; ithread < nthreads; ithread++) {
    double tt = 0;
    pthread_join(threads[ithread], (void **)&tt);
    t = max(t, tt);
    mint = min(tt, mint);
    avgt += tt;
  }  // for(ithread)
  // free memory
  free(threads);
  free(ctxs);
  // print the resulting times
  printf("total reconstruction time: %.3lf ms\n", avgt * 1e3 / nthreads);
  printf("reconstruction time stats: max=%.3lf ms, min=%.3lf ms, "
         "avg=%.3lf ms\n", t * 1e3, mint * 1e3, avgt * 1e3 / nthreads);
}  // reconstruct_hthread_fun

void BunchCtx::sync_back(cudaStream_t s) {
  cucheck(cudaMemcpyAsync(this, d_this, sizeof(*this),
                          cudaMemcpyDeviceToHost, s));
}  // sync_back

void BunchCtx::find_tube_hits_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // find pivot tube hits
  host_copy_if(pivot_hits, &d_this->npivot_hits, hits, nhits,
               hit_is_pivot(g->d_tubes), s);
  // find skewed tube hits
  host_copy_if(skewed_hits, &d_this->nskewed_hits, hits, nhits,
               hit_is_skewed(g->d_tubes), s);
  sync_back(s);
}  // find_tube_hits_hstream

void BunchCtx::find_triplets_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // find triplets
  size_t ntriplets_sz = g->npivot_ranges * sizeof(int);
  cucheck(cudaMemsetAsync(ntriplets, 0, ntriplets_sz, s));
  if(npivot_hits > 0) {
    int bs = 256;
    // one thread per pivot hit
    find_triplets_k<<<divup(npivot_hits, bs), bs, 0, s>>>(d_this);
    cucheck(cudaGetLastError());
  }
  cucheck(cudaMemcpyAsync(h_ntriplets, ntriplets, ntriplets_sz,
                          cudaMemcpyDeviceToHost, s));
}  // find_triplets_hstream

void BunchCtx::find_half_skewlets_hstream(cudaStream_t s) {
  // find half-skewlets
  const GlobalCtx *g = h_global_ctx;
  size_t nhalf_skewlets_sz = g->nskewed_ranges * sizeof(int);
  int bs = 256;
  if(nskewed_hits > 0) {
    // one thread per skewed hit
    find_half_skewlets_k<<<divup(nskewed_hits, bs), bs, 0, s>>>(d_this);
    cucheck(cudaGetLastError());
  }
  cucheck(cudaMemcpyAsync(h_nhalf_skewlets, nhalf_skewlets, nhalf_skewlets_sz,
                          cudaMemcpyDeviceToHost, s));
}  // find_half_skewlets_hstream

void BunchCtx::find_skewlets_pre_poca_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // main skewlet test (everything except poca)
  for(int irange = 0; irange < g->nskewlet_ranges; irange++) {
    int ninner = h_nhalf_skewlets[irange];
    int nouter = h_nhalf_skewlets[irange + 1];
    if(ninner == 0 || nouter == 0)
      continue;
    // main test: 2d grid over (inner, outer) half-skewlet pairs
    dim3 bs(SKEWLETS_BSX, SKEWLETS_BSY);
    dim3 grid(divup(ninner, bs.x), divup(nouter, bs.y));
    combine_half_skewlets_k<<<grid, bs, 0, s>>>(d_this, irange);
    cucheck(cudaGetLastError());
  }  // loop over ranges
  size_t npre_poca_pairs_sz = g->nskewlet_ranges * sizeof(int);
  cucheck(cudaMemcpyAsync
          (h_npre_poca_pairs, npre_poca_pairs, npre_poca_pairs_sz,
           cudaMemcpyDeviceToHost, s));
}  // find_skewlets_pre_poca_hstream

void BunchCtx::find_skewlets_poca_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // poca test
  for(int irange = 0; irange < g->nskewlet_ranges; irange++) {
    int npairs = h_npre_poca_pairs[irange];
    if(npairs == 0)
      continue;
    int bs = 128;
    // one thread per pre-poca pair
    poca_test_k<<<divup(npairs, bs), bs, 0, s>>>(d_this, irange);
    cucheck(cudaGetLastError());
  }
  size_t nskewlets_sz = g->nskewlet_ranges * sizeof(int);
  cucheck(cudaMemcpyAsync(h_nskewlets, nskewlets, nskewlets_sz,
                          cudaMemcpyDeviceToHost, s));
}  // find_skewlets_poca_hstream
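/* Note on the angle cut in find_cand_tracks_hstream() below: the live call
   passes h_min_cos_phis_g where the commented-out variant passed
   h_max_phis_g. Presumably this lets the kernel compare a dot product of unit
   directions against a precomputed cosine instead of calling acosf() per
   triplet pair, using the equivalence (for phi in [0, pi]):

   cos(phi) >= min_cos_phi  <=>  phi <= max_phi,  min_cos_phi = cos(max_phi)
*/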
void BunchCtx::find_cand_tracks_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // go through all pairs of pivot ranges
  for(int irange = 0; irange < g->npivot_ranges; irange++)
    for(int jrange = irange + 1; jrange < g->npivot_ranges; jrange++) {
      int ni = h_ntriplets[irange], nj = h_ntriplets[jrange];
      if(ni == 0 || nj == 0)
        continue;
      // 2d grid over (triplet i, triplet j) pairs
      int bsi = 64, bsj = 4;
      dim3 bs(bsi, bsj), grid(divup(ni, bsi), divup(nj, bsj));
      //combine_triplets_k<<<grid, bs, 0, s>>>
      //  (d_this, irange, jrange, h_max_phis_g[irange][jrange]);
      combine_triplets_k<<<grid, bs, 0, s>>>
        (d_this, irange, jrange, h_min_cos_phis_g[irange][jrange]);
      cucheck(cudaGetLastError());
    }
  sync_back(s);
}  // find_cand_tracks_hstream

void BunchCtx::compute_track_circles(cudaStream_t s) {
  int bs = 256;
  if(ncand_tracks > 0) {
    // one thread per candidate track
    compute_circles_k<<<divup(ncand_tracks, bs), bs, 0, s>>>
      (cand_tracks, ncand_tracks);
    cucheck(cudaGetLastError());
  }
}  // compute_track_circles

void BunchCtx::eval_cand_tracks_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  int bs = 0;
  if(ncand_tracks > 0) {
    bs = 256;
    compute_circles_k<<<divup(ncand_tracks, bs), bs, 0, s>>>
      (cand_tracks, ncand_tracks);
    cucheck(cudaGetLastError());
    bs = EVAL_TRACKS_BS;
    // evaluate candidate tracks against pivot tubes
    switch(g->opts.tube_strategy()) {
    case TubeTestRaster:
      eval_tracks_tubes_raster_k<<<divup(ncand_tracks, bs), bs, 0, s>>>
        (d_this);
      break;
    case TubeTestShared:
      eval_tracks_tubes_shared_k<<<divup(ncand_tracks, bs), bs, 0, s>>>
        (d_this);
      break;
    default:
      assert(0);
    }
    cucheck(cudaGetLastError());
    // evaluate candidate tracks against skewlets
    switch(g->opts.skewlet_strategy()) {
    case SkewletTestAll:
      eval_tracks_skewlets_all_k<<<divup(ncand_tracks, bs), bs, 0, s>>>
        (d_this);
      break;
    case SkewletTestBins:
      eval_tracks_skewlets_bins_k<<<divup(ncand_tracks, bs), bs, 0, s>>>
        (d_this);
      break;
    default:
      assert(0);
    }
    cucheck(cudaGetLastError());
  }
  // group all good tracks
  host_copy_if(found_tracks, &d_this->nfound_tracks, cand_tracks,
               ncand_tracks, is_track_good(7, 12, 10, g->d_tubes), s);
  sync_back(s);
}  // eval_cand_tracks_hstream
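/* host_copy_if(), used in find_tube_hits_hstream() and
   eval_cand_tracks_hstream() above, comes from dynthrust.h. From its call
   sites, its assumed contract is that of a stream compaction on stream s
   (a sketch of the signature, not the actual declaration):

   template<class T, class Pred>
   void host_copy_if(T *out, int *d_nout, const T *in, int n, Pred pred,
                     cudaStream_t s);

   i.e. out[0..*d_nout) receives the elements of in[0..n) satisfying pred,
   with the count written through the device-side pointer d_nout, which is
   why sync_back() is needed before the count can be read on the host.
*/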