#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include "cuda_runtime.h"

// Evaluates a CUDA runtime call and aborts with file/line information when it
// does not return cudaSuccess. Kernel launches return nothing, so follow them
// with CUDA_CHECK(cudaGetLastError()).
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cudaCheckErr_ = (call);                                   \
        if (cudaCheckErr_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(cudaCheckErr_));                       \
            std::abort();                                                     \
        }                                                                     \
    } while (0)

namespace debug {

// Prints the first aLength elements of the host array a, one per line.
void printArray(float *a, int aLength) {
    for (int i = 0; i < aLength; i++) {
        std::cout << a[i] << std::endl;
    }
}

// Prints the first aLengths elements of the host arrays a and b side by side
// as "a[i], b[i]", one pair per line.
void printTwoArraysNextToEachOther(float *a, float *b, int aLengths) {
    for (int i = 0; i < aLengths; i++) {
        std::cout << a[i] << ", " << b[i] << std::endl;
    }
}

}  // namespace debug

// Returns the degree-to-radian conversion factor (pi / 180) as a float.
__device__ float degToRad() {
    // The division is a double constant expression; the cast makes the
    // narrowing to float explicit instead of implicit in the return.
    return static_cast<float>(3.14159265358979323846 / 180.0);
}

// Fills angles[idx] with (angleStart + angleStepSize * idx) converted from
// degrees to radians, for every idx in [0, totalAmountOfSteps).
//
// Indexing is 1D: idx = blockIdx.x * blockDim.x + threadIdx.x. Threads with
// idx >= totalAmountOfSteps write nothing. No shared memory is used.
// The printf calls are debugging output emitted by every launched thread.
__global__ void GenerateAngles_Gpu_Kernel(float * angles, float angleStart, float angleStepSize, int totalAmountOfSteps) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    printf("idx = %i\n", idx);
    printf("totalAmountOfSteps = %i\n", totalAmountOfSteps);
    if (idx < totalAmountOfSteps) {
        float currentValue = angleStart + angleStepSize * idx;
        angles[idx] = currentValue * degToRad();
        printf("angles[idx] = %f\n", angles[idx]);
    }
}

// Allocates a device array and launches GenerateAngles_Gpu_Kernel to fill it
// with the angles angleStart, angleStart + angleStepSize, ... in radians.
// e.g. angleStart = 0, angleEnd = 360, angleStepSize = 0.5
//
// The number of angles is (angleEnd - angleStart) / angleStepSize truncated
// towards zero.
// TODO: Take care of borders - is 0..n-1 really the right amount of angles?
// TODO: make another function which doesn't need transferring back angle data
//
// Returns a DEVICE pointer that the caller must release with cudaFree, or
// NULL when the arguments do not yield at least one angle (zero step size,
// fewer than one step, or a step count that does not fit in an int).
// The kernel is launched on the default stream and this function does not
// wait for it to finish.
float * GenerateAngles_Gpu(float angleStart, float angleEnd, float angleStepSize) {
    std::cout << " -- cudaHough::GenerateAngles_Gpu -- " << std::endl;

    if (angleStepSize == 0.0f) {
        std::cerr << "GenerateAngles_Gpu: angleStepSize must not be zero" << std::endl;
        return NULL;
    }
    const float exactSteps = (angleEnd - angleStart) / angleStepSize;
    // !(x >= 1) also rejects NaN; the upper bound keeps the int cast defined.
    if (!(exactSteps >= 1.0f) || exactSteps >= 2147483648.0f) {
        std::cerr << "GenerateAngles_Gpu: arguments do not describe a valid number of angles" << std::endl;
        return NULL;
    }
    const int totalAmountOfSteps = static_cast<int>(exactSteps);

    // Device array with space for all the angles. It must NOT be initialised
    // with host memory: cudaMalloc overwrites the pointer.
    float * angles_d = NULL;
    const size_t arrayLength = static_cast<size_t>(totalAmountOfSteps) * sizeof(float);
    CUDA_CHECK(cudaMalloc((void **) & angles_d, arrayLength));

    const unsigned int nThreads = 32; // what about this value?!
    // Ceiling division so a partially filled last block is still launched.
    const unsigned int nBlocks = (static_cast<unsigned int>(totalAmountOfSteps) + nThreads - 1) / nThreads;

    std::cout << "DEBUG output" << std::endl;
    std::cout << "angleStart = " << angleStart << ", angleStepSize = " << angleStepSize << ", totalAmountOfSteps = " << totalAmountOfSteps << ", nBlocks = " << nBlocks << ", nThreads = " << nThreads << std::endl;

    GenerateAngles_Gpu_Kernel<<<nBlocks, nThreads>>>(angles_d, angleStart, angleStepSize, totalAmountOfSteps);
    CUDA_CHECK(cudaGetLastError()); // reports an invalid launch configuration

    return angles_d;
}

// Writes, for every flat index idx in [0, nHits * nAngles), the index of the
// hit it belongs to (idx / nAngles) into houghX[idx] and prints a debug line
// that includes x[hit].
//
// Indexing is 1D: idx = blockIdx.x * blockDim.x + threadIdx.x. Threads with
// idx >= nHits * nAngles do nothing. No shared memory is used.
// y, angles and houghY are accepted but not accessed by this kernel.
__global__ void myKernel(float * x, float * y, int nHits, float * angles, int nAngles, float * houghX, float * houghY) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < nHits * nAngles) {
        // Integer division inside the guard: exact for every idx, and never
        // evaluated when nAngles is 0 (the guard is then false).
        int nHit = idx / nAngles;
        printf("idx = %i, nangles = %i, hit = %i, x[nHit] = %f\n", idx, nAngles, nHit, x[nHit]);
        houghX[idx] = (float) nHit;
    }
}

// Splits a flat host array of nHits * nAngles floats into nHits vectors of
// nAngles elements each; row i is array[i * nAngles .. (i + 1) * nAngles).
// Returns an empty vector when nHits <= 0.
std::vector<std::vector<float> > arrayOfHitsToVectorOfHits(float * array, int nHits, int nAngles) {
    std::vector<std::vector<float> > tempOuterVector;
    if (nHits > 0) {
        tempOuterVector.reserve(nHits);
    }
    for (int i = 0; i < nHits; ++i) {
        std::vector<float> tempInnerVector(array + i * nAngles, array + (i + 1) * nAngles);
        tempOuterVector.push_back(tempInnerVector);
    }
    return tempOuterVector;
}

// Stores vec.size() in *arrayLength and returns a pointer to the first
// element of vec, or NULL when vec is empty. The pointer aliases the
// container's storage: it is not a copy and is only valid while vec is alive
// and not reallocated.
template <class T>
float * convertVectorToArray(T & vec, int * arrayLength) {
    *arrayLength = static_cast<int>(vec.size());
    if (vec.empty()) {
        return NULL; // &vec[0] would be undefined on an empty container
    }
    return & vec[0];
}

// Test driver: uploads three hits, generates the angles on the GPU, runs
// myKernel over all (hit, angle) pairs and prints the resulting houghX array.
int main() {
    float start = 0;
    float end = 180;
    float stepSize = 5;
    // Same truncating computation GenerateAngles_Gpu uses for its array size.
    int totalAmountOfSteps = static_cast<int>((end - start) / stepSize);
    int nAngles = totalAmountOfSteps;

    std::vector<float> vX;
    vX.push_back(1.);
    vX.push_back(2.);
    vX.push_back(5.);

    int nHits = 3;
    int nXHits;
    float * x = convertVectorToArray(vX, &nXHits);

    std::vector<float> vY(nHits);
    vY[0] = 1.;
    vY[1] = 2.;
    vY[2] = 3.;
    float * y = & vY[0];

    // Device pointers start as NULL; cudaMalloc provides the storage.
    float * x_d = NULL;
    float * y_d = NULL;
    size_t hitArrayLength = nHits * sizeof(float);
    CUDA_CHECK(cudaMalloc((void **) & x_d, hitArrayLength));
    CUDA_CHECK(cudaMalloc((void **) & y_d, hitArrayLength));
    CUDA_CHECK(cudaMemcpy(x_d, x, hitArrayLength, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(y_d, y, hitArrayLength, cudaMemcpyHostToDevice));

    float * angles_d = GenerateAngles_Gpu(start, end, stepSize);
    const int nCells = nHits * nAngles;
    if (angles_d == NULL || nCells <= 0) {
        std::cerr << "No angles generated, nothing to do" << std::endl;
        CUDA_CHECK(cudaFree(angles_d)); // cudaFree(NULL) is a no-op
        CUDA_CHECK(cudaFree(x_d));
        CUDA_CHECK(cudaFree(y_d));
        return 1;
    }

    float * houghX_d = NULL; // DEVICE array of circle center xs
    float * houghY_d = NULL; // DEVICE array of circle center ys
    size_t houghArrayLength = static_cast<size_t>(nCells) * sizeof(float);
    CUDA_CHECK(cudaMalloc((void **) & houghX_d, houghArrayLength));
    CUDA_CHECK(cudaMalloc((void **) & houghY_d, houghArrayLength));

    std::cout << "Launching Hough kernel" << std::endl;
    unsigned int nThreads = 32;
    // Ceiling division so a partially filled last block is still launched.
    unsigned int nBlocks = (static_cast<unsigned int>(nCells) + nThreads - 1) / nThreads;
    myKernel<<<nBlocks, nThreads>>>(x_d, y_d, nHits, angles_d, nAngles, houghX_d, houghY_d);
    CUDA_CHECK(cudaGetLastError());      // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize()); // errors raised while the kernels ran

    std::vector<float> houghX_h(nCells);
    CUDA_CHECK(cudaMemcpy(& houghX_h[0], houghX_d, houghArrayLength, cudaMemcpyDeviceToHost));

    std::cout << "nHits * nAngles = " << nHits * nAngles << std::endl;
    debug::printArray(& houghX_h[0], nHits * nAngles);

    std::vector<std::vector<float> > vOfHits = arrayOfHitsToVectorOfHits(& houghX_h[0], nHits, nAngles);
    (void) vOfHits; // built to exercise the conversion; not used further

    CUDA_CHECK(cudaFree(houghY_d));
    CUDA_CHECK(cudaFree(houghX_d));
    CUDA_CHECK(cudaFree(angles_d));
    CUDA_CHECK(cudaFree(y_d));
    CUDA_CHECK(cudaFree(x_d));

    return 0;
}