/** @file global.cu implementation of the global context */

#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <vector>

#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>

using namespace std;

#include "bunch-ctx.h"
#include "data.h"
#include "global-ctx.h"
#include "options.h"
#include "sector-layer.h"
#include "triplet-finder.h"
#include "util.h"
#include "vec.h"

#include "data.cu.h"

#define USE_DYNPAR 1
#define USE_TBLOCK 0
#define USE_HSTREAM 0

/** the size of the dynamic parallelism queue on device */
#define MAX_KERNELS 32768

/** global context on device */
__constant__ GlobalCtx global_ctx_g;
/** global sector-layer data on device */
__constant__ SectorLayer sector_layers_g[NSECTOR_LAYERS];
//__device__ GlobalCtx global_ctx_g;

GlobalCtx::GlobalCtx(const TripletFinderOptions &opts) {
  // zero out
  memset(this, 0, sizeof(*this));
  this->opts = opts;

  // set device and device options
#if WITH_DYN_PAR != 0
  cucheck(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount,
                             MAX_KERNELS));
#endif
  cucheck(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
  //cucheck(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));

  // set pivot ranges
  npivot_ranges = 3;
  size_t pivot_ranges_sz = npivot_ranges * sizeof(range);
  cucheck(cudaMallocHost((void **)&pivot_ranges, pivot_ranges_sz));
  cucheck(cudaMalloc((void **)&d_pivot_ranges, pivot_ranges_sz));
  // ranges are of the form [a, b), unlike in PandaRoot, where they are (a, b)
  pivot_ranges[0] = range::from(105, 215);
  pivot_ranges[1] = range::from(715, 855);
  pivot_ranges[2] = range::from(2957, 3175);
  cucheck(cudaMemcpy(d_pivot_ranges, pivot_ranges, pivot_ranges_sz,
                     cudaMemcpyHostToDevice));

  // set skewed ranges
  nskewed_ranges = 4;
  nskewlet_ranges = nskewed_ranges - 1;
  size_t skewed_ranges_sz = nskewed_ranges * sizeof(range);
  cucheck(cudaMallocHost((void **)&skewed_ranges, skewed_ranges_sz));
  cucheck(cudaMalloc((void **)&d_skewed_ranges, skewed_ranges_sz));
  skewed_ranges[0] = range::from(1001, 1395);
  skewed_ranges[1] = range::from(1395, 1813);
  skewed_ranges[2] = range::from(1813, 2267);
  skewed_ranges[3] = range::from(2267, 2745);
  cucheck(cudaMemcpy(d_skewed_ranges, skewed_ranges, skewed_ranges_sz,
                     cudaMemcpyHostToDevice));

  // print ranges
  printf("pivot ranges:\n");
  for(int irange = 0; irange < npivot_ranges; irange++) {
    range r = pivot_ranges[irange];
    printf("range %d: [%d, %d)\n", irange, r.start, r.end);
  }
  printf("skewed ranges:\n");
  for(int irange = 0; irange < nskewed_ranges; irange++) {
    range r = skewed_ranges[irange];
    printf("range %d: [%d, %d)\n", irange, r.start, r.end);
  }

  // get address for d_this
  cucheck(cudaGetSymbolAddress((void **)&d_this, global_ctx_g));

  // static initialization of bunch context
  BunchCtx::initialize();
}  // GlobalCtx

#if WITH_DYN_PAR
// a simple kernel to force initialization
// of the dynamic parallelism infrastructure
__global__ void dyn_init_k(int depth) {
  if(depth) {
    cudaStream_t s;
    cucheck_dev(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));
    dyn_init_k<<<4, 64, 0, s>>>(depth - 1);
    cucheck_dev(cudaGetLastError());
    cucheck_dev(cudaDeviceSynchronize());
    cucheck_dev(cudaStreamDestroy(s));
  }
}  // dyn_init_k()

void
GlobalCtx::dyn_init(void) {
  // force initialization of dynamic parallelism
  dyn_init_k<<<4, 64>>>(1);
  cucheck(cudaGetLastError());
  cucheck(cudaStreamSynchronize(0));
}  // dyn_init

void
GlobalCtx::reconstruct_dyn(void) {
  dyn_init();
  reco_time_start();
  int bs = 8;
  // grid size assumed: one thread per bunch
  reconstruct_dyn_k<<<(nbunches + bs - 1) / bs, bs>>>();
  cucheck(cudaGetLastError());
  cucheck(cudaStreamSynchronize(0));
  reco_time_end();
}  // reconstruct_dyn
#endif
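/** reconstructs all bunches using the thread-block-per-bunch strategy
    (reconstruct_tblock_k); the launch configuration below assumes one
    thread block per bunch */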
void
GlobalCtx::reconstruct_tblock(void) {
  reco_time_start();
  int bs = 256;
  // grid size assumed: one thread block per bunch
  reconstruct_tblock_k<<<nbunches, bs>>>();
  cucheck(cudaGetLastError());
  cucheck(cudaStreamSynchronize(0));
  reco_time_end();
}  // reconstruct_tblock

void
GlobalCtx::reconstruct_hstream(void) {
  reconstruct_hstream_fun(this);
}  // reconstruct_hstream

void
GlobalCtx::reconstruct_hthread(void) {
  reconstruct_hthread_fun(this);
}  // reconstruct_hthread

// element type of the returned vector assumed to be Track
vector<Track>
GlobalCtx::reconstruct(void) {
  load_tubes();
  build_straw_map();
  verify_straw_map();
  build_sector_layers();
  build_skewlet_bin_layers();
  load_hits();
  set_max_numbers();
  alloc_dev_memory();
  init_bunches();
  // copy this context to device
  cucheck(cudaMemcpy(d_this, this, sizeof(GlobalCtx), cudaMemcpyHostToDevice));
  find_neighbor_tubes();
  if(opts.group_on_device()) {
    sync_bunches_to_device();
    bunch_hits();
  } else {
    bunch_hits_host();
    sync_bunches_to_device();
  }
  //double tstart = omp_get_wtime();
  switch(opts.bunch_strategy()) {
  case BunchDynamic:
#if WITH_DYN_PAR
    reconstruct_dyn();
#else
    assert(0);
#endif
    break;
  case BunchThreadBlock:
    reconstruct_tblock();
    break;
  case BunchHostStream:
    reconstruct_hstream();
    break;
  case BunchHostStreamThread:
    reconstruct_hthread();
    break;
  default:
    assert(0);
  }
  //double tend = omp_get_wtime();
  //printf("total reconstruction time: %.3lf ms\n", (tend - tstart) * 1e3);

  // collect tracks
  vector<Track> vtracks;
  collect_tracks(vtracks);
  return vtracks;
}  // reconstruct

void
GlobalCtx::load_tubes(void) {
  Tube *tmp_tubes = read_tubes(opts.tubes_path, &ntubes);
  cucheck(host_alloc(&tubes, ntubes));
  for(int itube = 0; itube < ntubes; itube++)
    tubes[itube] = tmp_tubes[itube];
  // TODO: free tmp_tubes
}  // load_tubes

void
GlobalCtx::build_straw_map(void) {
  // find the number of rows and sectors
  nrows = nsectors = 0;
  for(int itube = 1; itube < ntubes; itube++) {
    nrows = max(nrows, tubes[itube].row);
    nsectors = max(nsectors, tubes[itube].sector);
  }
  nrows++;
  nsectors++;
  //nstraight_rows = nrows - NSKEWED_ROWS;
  //printf("%d rows, %d straight, %d sectors\n", nrows, nstraight_rows,
  //       nsectors);
  printf("%d rows, %d sectors\n", nrows, nsectors);

  // group tubes by rows and sectors
  vector<vector<vector<int> > > vstraw_map
    (nrows, vector<vector<int> >(nsectors, vector<int>()));
  for(int itube = 1; itube < ntubes; itube++) {
    Tube t = tubes[itube];
    vstraw_map[t.row][t.sector].push_back(itube);
  }

  // allocate memory for flat storage of the straw map
  size_t counts_sz = nrows * nsectors * sizeof(int);
  size_t map_sz = ntubes * sizeof(int);
  cucheck(cudaMallocHost((void **)&straw_map_starts, counts_sz));
  cucheck(cudaMallocHost((void **)&straw_map_counts, counts_sz));
  cucheck(cudaMalloc((void **)&d_straw_map_starts, counts_sz));
  cucheck(cudaMalloc((void **)&d_straw_map_counts, counts_sz));
  cucheck(cudaMallocHost((void **)&straw_map, map_sz));
  cucheck(cudaMalloc((void **)&d_straw_map, map_sz));

  // flatten the straw map
  int start = 0;
  for(int irow = 0; irow < nrows; irow++)
    for(int isector = 0; isector < nsectors; isector++) {
      vector<int> straws = vstraw_map[irow][isector];
      straw_map_starts[irow * nsectors + isector] = start;
      straw_map_counts[irow * nsectors + isector] = straws.size();
      copy(straws.begin(), straws.end(), straw_map + start);
      start += straws.size();
    }

  // copy everything to device
  cucheck(cudaMemcpy(d_straw_map, straw_map, map_sz, cudaMemcpyHostToDevice));
  cucheck(cudaMemcpy(d_straw_map_starts, straw_map_starts, counts_sz,
                     cudaMemcpyHostToDevice));
  cucheck(cudaMemcpy(d_straw_map_counts, straw_map_counts, counts_sz,
                     cudaMemcpyHostToDevice));
}  // build_straw_map
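/** checks the consistency of the straw map: for every (row, sector) pair,
    the tube numbers must form a contiguous range; for non-skewed layers,
    the distance between tube centers must additionally match the difference
    in tube numbers times TUBE_CENTER_DIST, and all centers must lie on one
    line; violations are only reported, not fixed */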
void
GlobalCtx::verify_straw_map(void) {
  for(int irow = 0; irow < nrows; irow++)
    for(int isector = 0; isector < nsectors; isector++) {
      int *straws = straw_map + straw_map_starts[irow * nsectors + isector];
      int nstraws = straw_map_counts[irow * nsectors + isector];
      // 1. tubes in the sector-layer form a contiguous range
      int min_itube = INT_MAX, max_itube = 0;
      for(int iitube = 0; iitube < nstraws; iitube++) {
        int itube = straws[iitube];
        min_itube = min(itube, min_itube);
        max_itube = max(itube, max_itube);
      }  // for each tube
      if(nstraws != max_itube - min_itube + 1) {
        printf("row %d, sector %d: tubes %d .. %d not in a contiguous range\n",
               irow, isector, min_itube, max_itube);
        continue;
      }
      // 1.5. skip the remaining checks if the layer is skewed
      if(tubes[min_itube].is_skewed())
        continue;
      // 2. distances from a tube to the minimum-numbered tube = difference in
      // numbers
      const float eps = 1e-3;
      Tube min_tube = tubes[min_itube];
      bool distances_as_numbers = true;
      for(int itube = min_itube; itube <= max_itube; itube++) {
        float expected_dist = abs(itube - min_itube) * TUBE_CENTER_DIST;
        float real_dist = dist((fvec3)min_tube.pos, (fvec3)tubes[itube].pos);
        if(fabsf(expected_dist - real_dist) > eps) {
          distances_as_numbers = false;
          printf("row %d, sector %d: tube %d does not have correct distance:"
                 " expected %f, real %f\n", irow, isector, itube,
                 (double)expected_dist, (double)real_dist);
          break;
        }
      }
      if(!distances_as_numbers)
        continue;
      // 3. centers of all tubes lie on the same line
      fvec2 k = normalized(tubes[min_itube + 1].pos.xy() - min_tube.pos.xy());
      fvec2 p0 = min_tube.pos.xy();
      //bool centers_on_same_line = true;
      for(int itube = min_itube; itube <= max_itube; itube++) {
        fvec2 line_pos = p0 + k * (float(itube - min_itube) * TUBE_CENTER_DIST);
        fvec2 pos = tubes[itube].pos.xy();
        if(dist(pos, line_pos) > eps) {
          //centers_on_same_line = false;
          printf("row %d, sector %d: tube %d doesn't lie on the same line\n",
                 irow, isector, itube);
          break;
        }
      }
    }  // for each row, sector (sector-layer)
  printf("sector-layer verification successful\n");
}  // verify_straw_map
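/** builds one SectorLayer per (row, sector) pair from the contiguous tube
    range recorded in the straw map, then uploads the resulting array both
    to global device memory and to the sector_layers_g constant-memory copy */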
void
GlobalCtx::build_sector_layers(void) {
  // allocate memory for sector-layers on host and device
  size_t sector_layer_sz = nsectors * nrows * sizeof(SectorLayer);
  cucheck(cudaMalloc((void **)&d_sector_layers, sector_layer_sz));
  cucheck(cudaMallocHost((void **)&sector_layers, sector_layer_sz));
  // build sector-layers
  //int istraight_row = 0;
  for(int irow = 0; irow < nrows; irow++) {
    //bool sl_added = false;
    for(int isector = 0; isector < nsectors; isector++) {
      // find min and max tube no.
      int isector_layer = irow * nsectors + isector;
      int *straws = straw_map + straw_map_starts[isector_layer];
      int nstraws = straw_map_counts[isector_layer];
      int min_straw = INT_MAX, max_straw = 0;
      for(int istraw = 0; istraw < nstraws; istraw++) {
        min_straw = min(min_straw, straws[istraw]);
        max_straw = max(max_straw, straws[istraw]);
      }
      // build the sector-layer for the specified straws
      SectorLayer sl = SectorLayer::from_tubes
        (tubes, range::from(min_straw, max_straw + 1));
      //if(!sl.is_skewed) {
      sector_layers[irow * nsectors + isector] = sl;
      //sl_added = true;
      //}
    }  // for(isector)
    //if(sl_added)
    //  istraight_row++;
  }  // for(irow)

  // transfer sector-layer data to device
  cucheck(cudaMemcpy(d_sector_layers, sector_layers, sector_layer_sz,
                     cudaMemcpyHostToDevice));
  // transfer sector-layer data to constant memory on device
  void *sector_layers_g_addr = 0;
  cucheck(cudaGetSymbolAddress(&sector_layers_g_addr, sector_layers_g));
  cucheck(cudaMemcpy(sector_layers_g_addr, sector_layers, sector_layer_sz,
                     cudaMemcpyHostToDevice));
}  // build_sector_layers

void
GlobalCtx::build_skewlet_bin_layers(void) {
  // allocate memory for skewlet bin layers
  size_t sector_layer_sz = nskewlet_ranges * nsectors * sizeof(SectorLayer);
  cucheck(cudaMalloc((void **)&d_skewlet_bin_layers, sector_layer_sz));
  cucheck(cudaMallocHost((void **)&skewlet_bin_layers, sector_layer_sz));
  // build skewlet bin layers
  for(int iskewlet_layer = 0; iskewlet_layer < nskewlet_ranges;
      iskewlet_layer++)
    for(int isector = 0; isector < nsectors; isector++) {
      int ibin_layer = iskewlet_layer * nsectors + isector;
      int bins_start = NSL_SKEWLET_BINS * ibin_layer;
      int irow = SKEWED_ROW_START + iskewlet_layer * 2;
      skewlet_bin_layers[ibin_layer] = SectorLayer::from_skewlet_bins
        (&sector_layers[irow * nsectors + isector], nsectors, tubes,
         range::from(bins_start, bins_start + NSL_SKEWLET_BINS));
    }
  // copy skewlet bin layers to device
  cucheck(cudaMemcpy(d_skewlet_bin_layers, skewlet_bin_layers, sector_layer_sz,
                     cudaMemcpyHostToDevice));
}  // build_skewlet_bin_layers

void
GlobalCtx::load_hits(void) {
  nhits = opts.nhits;
  hits = read_hits(opts.hits_path, &nhits);
  printf("read %d tubes and %d hits\n", ntubes, nhits);
  // mark pivot ranges
  for(int irange = 0; irange < npivot_ranges; irange++) {
    range r = pivot_ranges[irange];
    for(int itube = r.start; itube < r.end; itube++)
      tubes[itube].pivot_range_id = irange;
  }
  // mark skewed ranges
  for(int irange = 0; irange < nskewed_ranges; irange++) {
    range r = skewed_ranges[irange];
    for(int itube = r.start; itube < r.end; itube++)
      tubes[itube].skewed_range_id = irange;
  }
  // copy to device
  //cucheck(cudaMalloc((void **)&d_tubes, ntubes * sizeof(Tube)));
  cucheck(cuda_alloc(&d_tubes, ntubes));
  cucheck(cudaMalloc((void **)&d_hits, nhits * sizeof(Hit)));
  //cucheck(cudaMemcpy(d_tubes, tubes, ntubes * sizeof(Tube),
  //                   cudaMemcpyHostToDevice));
  cucheck(cuda_copy(d_tubes, tubes, ntubes, cudaMemcpyHostToDevice));
  cucheck(cudaMemcpy(d_hits, hits, nhits * sizeof(Hit),
                     cudaMemcpyHostToDevice));
  //cucheck();
  printf("copied tubes and hits to device\n");
}  // load_hits
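/** derives the number of bunches from the time of the last (pre-sorted) hit
    and estimates the per-bunch capacities for hits, triplets, skewlets and
    tracks from the corresponding rate constants and the extended bunch
    duration tbunch + DRIFT_TIME_PEPS */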
printf("max_nhits = %d, max_ntriplets = %d\n", max_nhits, max_ntriplets); printf("max_nskewlets = %d, max_ntracks = %d\n", max_nskewlets, max_ntracks); } // set_max_numbers void GlobalCtx::alloc_dev_memory(void) { cucheck(cudaMalloc((void **)&d_nneighbors, ntubes * sizeof(int))); cucheck(cudaMemset(d_nneighbors, 0, ntubes * sizeof(int))); size_t pitch; cucheck(cudaMallocPitch((void **)&d_neighbor_tubes, &pitch, ntubes * sizeof(int), MAX_NNEIGHBORS)); neighbor_pitch = pitch / sizeof(int); } // alloc_dev_memory void GlobalCtx::init_bunches(void) { // allocate memory for bunches size_t bunch_sz = nbunches * sizeof(BunchCtx); cucheck(cudaMallocHost((void **)&bunches, bunch_sz)); cucheck(cudaMalloc((void **)&d_bunches, bunch_sz)); // initialize bunches float tbunch = opts.tbunch; //printf("tbunch = %lf\n ns", tbunch); for(int ibunch = 0; ibunch < nbunches; ibunch++) { bunches[ibunch] = BunchCtx (this, d_bunches + ibunch, ibunch, ibunch * tbunch, tbunch); bunches[ibunch].alloc_memory(); } // for(each bunch) } // init_bunches /** @remarks this kernel is called at most once per program execution, and should not be optimized */ __global__ void find_neighbor_tubes_k (int *neighbor_tubes, int *nneighbors, int ntubes, int pitch, layptr tubes) { int itube = threadIdx.x + blockIdx.x * blockDim.x; if(itube <= 0 || itube >= ntubes) return; for(int jtube = 1; jtube < ntubes; jtube++) { if(jtube == itube) continue; if(tubes[itube].has_neighbor(tubes[jtube])) { // add a neighbor tube; might use atomics // if more threads are required int ineighbor = nneighbors[itube]++; if(ineighbor > MAX_NNEIGHBORS - 1) continue; neighbor_tubes[ineighbor * pitch + itube] = jtube; } } // for(jtube) } // find_neighbor_tubes_k void GlobalCtx::find_neighbor_tubes(void) { // find neighbor tubes int bs = 256; find_neighbor_tubes_k<<>> (d_neighbor_tubes, d_nneighbors, ntubes, neighbor_pitch, d_tubes); //cucheck(cudaStreamSynchronize(0)); // print out maximum and minimum number of neighbors thrust::device_ptr dt_nneighbors(d_nneighbors); int max_nneighbors = thrust::reduce(dt_nneighbors + 1, dt_nneighbors + ntubes, -1, thrust::maximum()); int min_nneighbors = thrust::reduce(dt_nneighbors + 1, dt_nneighbors + ntubes, INT_MAX, thrust::minimum()); printf("tubes have from %d to %d neighbors\n", min_nneighbors, max_nneighbors); } // find_neighbor_tubes /** a helper kernel to bunch hits TODO: do this on CPU, so that sorted hit order is preserved */ __global__ void bunch_hits_k(void) { int ihit = threadIdx.x + blockIdx.x * blockDim.x; int nhits = global_ctx_g.nhits; if(ihit >= nhits) return; // get the bunch for the hit Hit *hits = global_ctx_g.d_hits; float tbunch = global_ctx_g.opts.tbunch, t = hits[ihit].t; int ibunch = (int)floorf(t / tbunch); // add the hit to its own bunch BunchCtx *own_bunch = &global_ctx_g.d_bunches[ibunch]; int pos = atomicAdd(&own_bunch->nhits, 1); own_bunch->hits[pos] = hits[ihit]; // check if the hit needs to be added to another bunch as non-core float tstart = tbunch * ibunch; if(t < tstart + DRIFT_TIME_PEPS && ibunch > 0) { BunchCtx *other_bunch = &global_ctx_g.d_bunches[ibunch - 1]; pos = atomicAdd(&other_bunch->nhits, 1); other_bunch->hits[pos] = hits[ihit]; } } // bunch_hits_k void GlobalCtx::bunch_hits(void) { int bs = 128; bunch_hits_k<<>>(); cucheck(cudaGetLastError()); } // bunch_hits void GlobalCtx::bunch_hits_host(void) { for(int ibunch = 0; ibunch < nbunches; ibunch++) { BunchCtx &b = bunches[ibunch]; b.bunch_hits_host(); b.find_tube_hits_host(); } } // bunch_hits_host void 
void
GlobalCtx::sync_bunches_to_device(void) {
  // synchronizes the bunches (not the context itself) to the device
  // copy bunch contexts to device
  size_t bunch_sz = nbunches * sizeof(BunchCtx);
  cucheck(cudaMemcpy(d_bunches, bunches, bunch_sz, cudaMemcpyHostToDevice));
  // copy the data in each bunch
  for(int ibunch = 0; ibunch < nbunches; ibunch++) {
    bunches[ibunch].sync_arrays_to_device();
  }
}  // sync_bunches_to_device

void
GlobalCtx::collect_tracks(vector<Track> &tracks) {
  // copy back the bunch contexts
  cucheck(cudaMemcpy(bunches, d_bunches, nbunches * sizeof(BunchCtx),
                     cudaMemcpyDeviceToHost));
  // collect tracks from all bunches
  for(int ibunch = 0; ibunch < nbunches; ibunch++)
    bunches[ibunch].collect_tracks(tracks);
}  // collect_tracks
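/* usage sketch (illustrative only): a typical driver is assumed to fill a
   TripletFinderOptions structure (for example from the command line),
   construct a GlobalCtx and call reconstruct(); parse_options() is a
   hypothetical helper, and the Track element type is assumed to match the
   declaration in global-ctx.h

     int main(int argc, char **argv) {
       TripletFinderOptions opts = parse_options(argc, argv);  // hypothetical
       GlobalCtx ctx(opts);
       vector<Track> tracks = ctx.reconstruct();
       printf("reconstructed %d tracks\n", (int)tracks.size());
       return 0;
     }
*/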