/** @file hough.cu Hough transform implementation file */ #include #include #include #include #include #include #include #include "hough.h" #include "util.h" Hough::Hough(const vector &eps, const HoughOptions &opts) : eps(eps), opts(opts), np(eps.size()), nevt(0), h_evt_starts(0), d_evt_starts(0), d_xs(0), d_ys(0), d_xis(0), d_yis(0), d_tracks(0), d_pntracks(0), ntracks(0), rinv(0) { } // Hough Hough::~Hough() { // free host data free(h_evt_starts); // free device data cudaFree(d_evt_starts); cudaFree(d_xs); cudaFree(d_ys); cudaFree(d_xis); cudaFree(d_yis); cudaFree(d_tracks); cudaFree(d_pntracks); } // ~Hough vector Hough::transform() { prepare(); delimit_events(); copy_to_dev(); invert_points(); hough_hough(); copy_tracks_back(); return tracks; } // transform void Hough::prepare() { // nothing currently // set CUDA limits size_t devqueue_sz; cucheck(cudaDeviceGetLimit(&devqueue_sz, cudaLimitDevRuntimePendingLaunchCount)); printf("current device queue size: %zd bytes\n", devqueue_sz); devqueue_sz *= 4; printf("new device queue size: %zd bytes\n", devqueue_sz); cucheck(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, devqueue_sz)); } // prepare void Hough::delimit_events() { // we assume that points of each event are contiguous in the input vector evt_starts_v; for(int i = 0; i <= eps.size(); i++) { if(i == 0 || i == eps.size() || eps[i].evtid != eps[i - 1].evtid) evt_starts_v.push_back(i); } nevt = evt_starts_v.size() - 1; //fprintf(stderr, "nevt=%d\n", nevt); size_t evt_starts_sz = (nevt + 1) * sizeof(int); h_evt_starts = (int*)malloc(evt_starts_sz); copy(evt_starts_v.begin(), evt_starts_v.end(), h_evt_starts); // print every 100 host event starts // for(int ievt = 0; ievt <= nevt; ievt += 100) // printf("evt_starts[%d] = %d\n", ievt, h_evt_starts[ievt]); // also compute inverted radius float r2min = numeric_limits::infinity(); for(int i = 0; i < eps.size(); i++) { float x = eps[i].x, y = eps[i].y; r2min = min(r2min, x * x + y * y); } int nr = opts.ngrid; rinv = nr / ((nr + 1) / (nr * sqrt(r2min))); //fprintf(stderr, "preinv_rmin=%lf, rinv=%lf\n", (double)sqrt(r2min), (double)rinv); } // delimit_events void Hough::copy_to_dev() { // allocate device memory size_t evt_starts_sz = (nevt + 1) * sizeof(int); size_t ps_sz = np * sizeof(float); size_t tracks_sz = opts.nmax_tracks * sizeof(Track); cucheck(cudaMalloc((void**)&d_evt_starts, evt_starts_sz)); cucheck(cudaMalloc((void**)&d_xs, ps_sz)); cucheck(cudaMalloc((void**)&d_ys, ps_sz)); cucheck(cudaMalloc((void**)&d_xis, ps_sz)); cucheck(cudaMalloc((void**)&d_yis, ps_sz)); cucheck(cudaMalloc((void**)&d_tracks, tracks_sz)); cucheck(cudaMalloc((void**)&d_pntracks, sizeof(int))); // copy data to device cucheck(cudaMemcpy(d_evt_starts, h_evt_starts, evt_starts_sz, cudaMemcpyHostToDevice)); // extract points from device float* h_xs = (float *)malloc(ps_sz); float* h_ys = (float *)malloc(ps_sz); for(int i = 0; i < np; i++) { h_xs[i] = eps[i].x; h_ys[i] = eps[i].y; } cucheck(cudaMemcpy(d_xs, h_xs, ps_sz, cudaMemcpyHostToDevice)); cucheck(cudaMemcpy(d_ys, h_ys, ps_sz, cudaMemcpyHostToDevice)); free(h_xs); free(h_ys); } // copy_to_dev void __global__ invert_points_k (float *xis, float *yis, const float *xs, const float *ys, int np) { int i = threadIdx.x + blockIdx.x * blockDim.x; if(i < np) { float x = xs[i], y = ys[i]; float invr = 1 / (x * x + y * y); float xi = x * invr, yi = y * invr; xis[i] = xi; yis[i] = yi; } } // invert_points_k void Hough::invert_points() { // invert points, device-side int bs = 256; invert_points_k<<>>(d_xis, d_yis, d_xs, d_ys, np); } // invert_points #define HOUGH_BS 128 #define HOUGH_PS 256 /** the kernel to be launched for each HT; this is a 1-block kernel */ template __global__ void hough_hough_k (Track *tracks, /** tracks will be written here */ int *pntracks, /** track counter, set to 0 at start */ float *xis, /** x point coordinates after inversion */ float *yis, /** y point coordinates after inversion */ int ievt_base, /** base event id */ int *evt_starts, /** array of event starts */ int depth, /** HT depth */ float rmin, /** minimum radius for HT */ float rstep, /** radius step for HT */ float rinv, /** multiplier for HT radius to get index */ float amin, /** minimum angle for HT */ float astep, /** angle step */ HoughOptions opts /** additional HT options */ ) { // shared memory data __shared__ float lxis[HOUGH_PS], lyis[HOUGH_PS]; // point coordinates __shared__ int hbins[HOUGH_NR]; // hough bins __shared__ float lsina, lcosa, lsina1, lcosa1; // shared angle sine/cosine values //int na = opts.ngrid, nr = HOUGH_NR, np = evtnp; bool is_inner = depth < opts.max_depth; int tid = threadIdx.x, bs = blockDim.x; // thread id & block size int na = opts.ngrid, nr = opts.ngrid; int ievt = ievt_base + blockIdx.x; int evt_start = evt_starts[ievt]; int np = evt_starts[ievt + 1] - evt_start; // load points for(int i = tid; i < np; i += bs) { lxis[i] = xis[evt_start + i]; lyis[i] = yis[evt_start + i]; } __syncthreads(); // iterate over angle values for(int ia = 0; ia < na; ia++) { float alpha = amin + ia * astep; float alpha1 = alpha + astep; // evaluate sin/cos, store in shared memory if(!tid) { lsina = sin(alpha); lcosa = cos(alpha); lsina1 = sin(alpha1); lcosa1 = cos(alpha1); } // clean up bins for(int ir = tid; ir < HOUGH_NR; ir += bs) hbins[ir] = 0; __syncthreads(); // evaluate HT for current bins float sina = lsina, cosa = lcosa, sina1 = lsina1, cosa1 = lcosa1; // float sina = sin(alpha), cosa = cos(alpha); for(int i = tid; i < np; i += bs) { float r = lxis[i] * cosa + lyis[i] * sina; float r1 = lxis[i] * cosa1 + lyis[i] * sina1; int ir = (int)floor((r - rmin) * rinv); int ir1 = (int)floor((r1 - rmin) * rinv); // no anti-aliasing for leaves // if(!is_inner) // ir1 = ir; int irstart = min(ir, ir1), irend = max(ir, ir1); for(int irc = irstart; irc <= irend; irc++) { if(irc >= 0 && irc < nr) atomicAdd(hbins + irc, 1); } } // for each point __syncthreads(); // analyze HT results //int max_depth = opts.max_depth; int threshold = is_inner ? opts.node_threshold : opts.leaf_threshold; for(int ir = tid; ir < nr; ir += bs) { int binval = hbins[ir]; if(binval > threshold) { if(is_inner) { // zoom in float new_rmin = rmin + ir * rstep; float new_rstep = rstep / nr; float new_rinv = rinv * nr; float new_astep = astep / na; hough_hough_k <<<1, HOUGH_BS>>> (tracks, pntracks, xis, yis, ievt, evt_starts, depth + 1, new_rmin, new_rstep, new_rinv, alpha, new_astep, opts); cudaError_t err = cudaGetLastError(); if(err != cudaSuccess) { printf("problem launching HT for event %d, depth %d: %s\n", ievt, depth + 1, cudaGetErrorString(err)); } } else { // found a track, store if we have a place for it int it = atomicAdd(pntracks, 1); if(it < opts.nmax_tracks) { float r = rmin + ir * rstep; float invr = 1 / (2 * r); float x0 = cosa * invr, y0 = sina * invr; //printf("track %d: (%lf, %lf, %lf)\n", it, (double)x0, (double)y0, // (double)invr); tracks[it] = Track(x0, y0, invr, binval, ievt); } } // if(not reached maximum depth) } // if(more than threshold) } // for(each radius bin) if(is_inner) if(cudaDeviceSynchronize() != cudaSuccess) { printf("problems synching on kernels started for event %d, depth %d", ievt, depth + 1); } __syncthreads(); } // for all angle values } // hough_hough_k /** the kernel which launches top-level HT for each event */ template __global__ void event_hough_k (Track *tracks, int *pntracks, float *xis, float *yis, int *evt_starts, /** event starts */ int nevt, /** event starts */ float rinv, /** multiplier for HT radius to get index */ HoughOptions opts /** additional HT options */ ) { // just start one HT kernel per event int ievt = threadIdx.x + blockIdx.x * blockDim.x; if(ievt < nevt) { int na = opts.ngrid; // start a 1-block HT kernel with dynamic parallelism hough_hough_k <<<1, HOUGH_BS>>> (tracks, pntracks, xis, yis, ievt, evt_starts, 1, 0.0f, 1 / rinv, rinv, 0.0f, 2.0f * M_PI / na, opts); // check for success cudaError_t err = cudaGetLastError(); if(err != cudaSuccess) { // report a launch error printf("problem launching kernel for event %d: %s\n", ievt, cudaGetErrorString(err)); } // if(error handling) } } // event_hough_k void Hough::hough_hough() { // reset track counter cucheck(cudaMemset(d_pntracks, 0, sizeof(int))); // call event processing kernel //int bs = 256; // event_hough_k <<>> // (d_tracks, d_pntracks, d_xis, d_yis, d_evt_starts, nevt, rinv, opts); // measure time double t1 = omp_get_wtime(); int na = opts.ngrid; hough_hough_k <<>> (d_tracks, d_pntracks, d_xis, d_yis, 0, d_evt_starts, 1, 0.0f, 1 / rinv, rinv, 0.0f, 2.0f * M_PI / na, opts); cucheck(cudaThreadSynchronize()); double t2 = omp_get_wtime(); printf("Hough time: %lf s\n", t2 - t1); } // hough_hough void Hough::copy_tracks_back() { // copy track count cucheck(cudaMemcpy(&ntracks, d_pntracks, sizeof(int), cudaMemcpyDeviceToHost)); ntracks = min(ntracks, opts.nmax_tracks); fprintf(stderr, "ntracks=%d\n", ntracks); // copy tracks Track *h_tracks = (Track*)malloc(ntracks * sizeof(Track)); cucheck(cudaMemcpy(h_tracks, d_tracks, ntracks * sizeof(Track), cudaMemcpyDeviceToHost)); tracks.insert(tracks.end(), h_tracks, h_tracks + ntracks); free(h_tracks); } // copy_tracks_back vector hough_transform (const vector &eps, const HoughOptions &opts) { Hough hough(eps, opts); return hough.transform(); } // hough_transform