/** * @file CircleHough.cu * @brief Core of the GPU Circle Hough * @details The core functions, including kernels and stuff, for the GPU Circle Hough * * To make the file working both standalone and PandaRoot-included, some parts are steered by compiler definitions. * * @date 31 Jul 2015 * @author Andreas Herten * @author Ludovico Bianchi */ #define VERBOSE 1 #include #include #include #include #include #include "cuda_runtime.h" #include "cuda_profiler_api.h" // for creating profiler regions (http://docs.nvidia.com/cuda/profiler-users-guide/index.html#focusing-profiling) #include "PndGpu.cu" #ifdef THERE_IS_ROOT #include "TVector2.h" #else using namespace PndGpu::fake; #endif bool doTiming = false; __device__ float degToRad() { return 3.14159265358979323846 / 180.0; } __global__ void GenerateAngles_Gpu_Kernel(float *angles, float angleStart, float angleStepSize, int totalAmountOfSteps) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < totalAmountOfSteps) // CUDA blocks might be too big on the borders { float currentValue = angleStart + angleStepSize * idx; angles[idx] = currentValue * degToRad(); } } __global__ void IsoCalc_Gpu_Kernel(float *angles, float *houghX, float *houghY, int nAlphas, float x, float y, float r) { int idx = blockIdx.x * blockDim.x + threadIdx.x; // if (idx % 100) printf("I'm on a GPU\n"); if (idx < nAlphas) { float sinVal, cosVal; sincosf(angles[idx], &sinVal, &cosVal); float s = (r * r - x * x - y * y) / (2 * (x * cosVal + y * sinVal + r)); houghX[idx] = x + s * cosVal; houghY[idx] = y + s * sinVal; } } #ifndef CH_STANDALONE extern "C" #endif float * GenerateAngles_Gpu(float angleStart, float angleEnd, float angleStepSize) { // create context to initialize CUDA environment cudaFree(0); // beginning of profiling region cudaProfilerStart(); // e.g. angleStart = 0, angleEnd = 360, angleStepSize = 0.5 std::cout << " -- cudaHough::GenerateAngles_Gpu -- " << std::endl; int totalAmountOfSteps = (angleEnd - angleStart) / angleStepSize; float * angles_d; // device array with space for all the angles // TODO: Take care of borders - is 0..n-1 really the right amount of angles? size_t arrayLength = totalAmountOfSteps * sizeof(float); cucheck(cudaMalloc((void **) & angles_d, arrayLength)); // cucheck(cudaMallocHost((void **) & angles_h, arrayLength)); unsigned int nThreads = 32; // what about this value?! unsigned int nBlocks = totalAmountOfSteps / nThreads + (totalAmountOfSteps % nThreads == 0 ? 0 : 1); std::cout << "DEBUG output" << std::endl; std::cout << "angleStart = " << angleStart << ", angleStepSize = " << angleStepSize << ", totalAmountOfSteps = " << totalAmountOfSteps << ", nBlocks = " << nBlocks << ", nThreads = " << nThreads << std::endl; cudaEvent_t start, stop; if (doTiming) { cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start); } GenerateAngles_Gpu_Kernel<<>>(angles_d, angleStart, angleStepSize, totalAmountOfSteps); if (doTiming) { cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); // std::cout << " -- cudaHough::GenerateAngles_Gpu -- Time for generating angles = " << milliseconds << " ms." << std::endl; cudaEventDestroy(start); cudaEventDestroy(stop); } // cudaMemcpy(angles_h, angles_d, arrayLength, cudaMemcpyDeviceToHost); // // cudaFree(angles_d); // std::cout << "Printing content of GPU-side generated angles:" << std::endl; // PndGpu::debug::printArray(angles_h, totalAmountOfSteps); // return angles_h; // cucheck(cudaFreeHost(angles_h)); // cucheck(cudaFree(angles_d)); // end of profiling region cudaProfilerStop(); return angles_d; } __global__ void IsoCalc_allHits_Gpu_Kernel(float *x, float *y, float *r, int nHits, float *angles, int nAngles, float *houghX, float *houghY) { // printf("GPU!\n"); int idx = blockIdx.x * blockDim.x + threadIdx.x; int currentHit = (int)((float)idx / (float)nAngles); if (idx < nHits * nAngles) { // if (idx % 1000) printf("I'm on a GPU!\n"); float currentX = x[currentHit]; float currentY = y[currentHit]; float currentR = r[currentHit]; int currentAngleIndex = idx - currentHit * nAngles; float sinVal, cosVal; sincosf(angles[currentAngleIndex], &sinVal, &cosVal); float s = (currentR * currentR - currentX * currentX - currentY * currentY) / (2 * (currentX * cosVal + currentY * sinVal + currentR)); houghX[idx] = currentX + s * cosVal; houghY[idx] = currentY + s * sinVal; } } #ifndef CH_STANDALONE extern "C" #endif std::vector > IsoCalc_allHits_Gpu(float *angles_d, int nAngles, std::vector x, std::vector y, std::vector r) // the std vector could as well be PndGeneralHoughHits directly { // beginning of profiling region cudaProfilerStart(); // convert hit point std::vectors to arrays int nHits; float *x_h = PndGpu::conversion::vectorToArray(x, & nHits); // HOST array of all hit point x coords float *y_h = PndGpu::conversion::vectorToArray(y, & nHits); // HOST array of all hit point y coords float *r_h = PndGpu::conversion::vectorToArray(r, & nHits); // HOST array of all hit point r coords std::cout << "nHits = " << nHits << std::endl; // allocate device memory and copy hits into it float *x_d; // DEVICE array of all hit point x coords float *y_d; // DEVICE array of all hit point y coords float *r_d; // DEVICE array of all hit point r coords size_t hitArrayLength = nHits * sizeof(float); cucheck(cudaMalloc((void **) & x_d, hitArrayLength)); cucheck(cudaMalloc((void **) & y_d, hitArrayLength)); cucheck(cudaMalloc((void **) & r_d, hitArrayLength)); cucheck(cudaMemcpy(x_d, x_h, hitArrayLength, cudaMemcpyHostToDevice)); cucheck(cudaMemcpy(y_d, y_h, hitArrayLength, cudaMemcpyHostToDevice)); cucheck(cudaMemcpy(r_d, r_h, hitArrayLength, cudaMemcpyHostToDevice)); // allocate memory for hough-transformed values, both on device and host float *houghX_h; // HOST array of circle center xs float *houghY_h; // HOST array of circle center ys float *houghX_d; // DEVICE array of circle center xs float *houghY_d; // DEVICE array of circle center ys size_t houghArrayLength = nHits * nAngles * sizeof(float); // the array is nHits times longer than the one in the 'one hit at a time'-kernel version cucheck(cudaMallocHost((void **) & houghX_h, houghArrayLength)); cucheck(cudaMallocHost((void **) & houghY_h, houghArrayLength)); cucheck(cudaMalloc((void **) & houghX_d, houghArrayLength)); cucheck(cudaMalloc((void **) & houghY_d, houghArrayLength)); // configure kernel calls and launch kernel unsigned int nThreads = 256; unsigned int nBlocks = nHits * nAngles / nThreads + ((nHits * nAngles) % nThreads == 0 ? 0 : 1); std::cout << "nThreads = " << nThreads << ", nBlocks = " << nBlocks << std::endl; // insert direct timing here! cudaEvent_t start, stop; float milliseconds = 0; if (doTiming) { cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start); } IsoCalc_allHits_Gpu_Kernel <<< nBlocks, nThreads>>>(x_d, y_d, r_d, nHits, angles_d, nAngles, houghX_d, houghY_d); // cucheck(cudaGetLastError()); if (doTiming) { cudaEventRecord(stop); cudaEventSynchronize(stop); cudaEventElapsedTime(&milliseconds, start, stop); std::cout << " -- cudaHough::IsoCalc_allHits_Gpu_Kernel -- Time for generating hough values = " << milliseconds << " ms." << std::endl; cucheck(cudaEventDestroy(start)); cucheck(cudaEventDestroy(stop)); } cucheck(cudaGetLastError()); // copy back (dev --> host) fully filled hough arrays // cucheck(cudaMemcpy(houghX_h, houghX_d, houghArrayLength, cudaMemcpyDeviceToHost)); // cucheck(cudaMemcpy(houghY_h, houghY_d, houghArrayLength, cudaMemcpyDeviceToHost)); cucheck(cudaMemcpyAsync(houghX_h, houghX_d, houghArrayLength, cudaMemcpyDeviceToHost)); cucheck(cudaMemcpyAsync(houghY_h, houghY_d, houghArrayLength, cudaMemcpyDeviceToHost)); // end of profiling region std::vector > tempVector = PndGpu::conversion::splitArraysAndConvertToTVector(houghX_h, houghY_h, nHits, nAngles); cucheck(cudaFree(x_d)); cucheck(cudaFree(y_d)); cucheck(cudaFree(r_d)); cucheck(cudaFree(houghX_d)); cucheck(cudaFree(houghY_d)); cucheck(cudaFreeHost(houghX_h)); cucheck(cudaFreeHost(houghY_h)); // split arrays hit-wise and convert to TVectors cudaProfilerStop(); return tempVector; }