#ifndef TRIPLETS_DYNTHRUST_H_ #define TRIPLETS_DYNTHRUST_H_ /** @file dynthrust.h thrust-like functions, but usable with dynamic parallelism, mostly asynchronous and not guaranteed to be that well optimized */ #include "util.h" /** conditional copying kernel working with dynamic parallelism */ template static __global__ void dyn_copy_if_k (To to, int *ctr, Ti from, int n, F f) { int i = threadIdx.x + blockIdx.x * blockDim.x; if(i >= n) return; if(f(from[i])) { //int pos = atomicAdd(ctr, 1); int pos = atomicAggInc(ctr); to[pos] = from[i]; } } // dyn_copy_if_k /** conditional copying with a single thread block */ template static __device__ void tblock_copy_if (To to, int *ctr, Ti from, int n, F f) { for(int i = threadIdx.x; i < n; i += blockDim.x) { if(f(from[i])) { //int pos = atomicAdd(ctr, 1); int pos = atomicAggInc(ctr); to[pos] = from[i]; } // if(f(from[i])) } // for each element } // tblock_copy_if /** conditional copying inside dynamic parallelism environment; this is a non-blocking function @param ctr element counter to be used by the function */ template static __device__ void dyn_copy_if (To to, int *ctr, Ti from, int n, const F &f, cudaStream_t s = 0) { *ctr = 0; if(n > 0) { int bs = 256; dyn_copy_if_k<<>>(to, ctr, from, n, f); cucheck_dev(cudaGetLastError()); } } // dyn_copy_if /** conditional copying called from host; this is a non-blocking function @param ctr element counter to be used by the function _on device_ */ template static void host_copy_if (To to, int *ctr, Ti from, int n, const F &f, cudaStream_t s = 0) { //*ctr = 0; cucheck(cudaMemsetAsync(ctr, 0, sizeof(int), s)); if(n > 0) { int bs = 256; dyn_copy_if_k<<>>(to, ctr, from, n, f); cucheck(cudaGetLastError()); } } // dyn_copy_if /** device function that does inclusive prefix scan in shared memory */ template static __device__ void ipre_sum(T *arr, int bs) { int tid = threadIdx.x; // increasing stride for(int s = 1; s < bs; s *= 2) { __syncthreads(); int j = (tid + 1) * 2 * s - 1; if(j < bs) arr[j] += arr[j - s]; } // decreasing stride for(int s = bs / 4; s > 0; s /= 2) { __syncthreads(); int j = (tid + 1) * 2 * s - 1; if(j + s < bs) arr[j + s] += arr[j]; } } // ipre_sum /** kernel for the first step of the exclusive prefix sum algorithm @tparam bs the size of the block within which the prefix sum is computed, also the size of the kernel */ template static __global__ void xpre_sum_block_k(T *to, T *scratch, T* from, int n) { __shared__ T arr[bs]; // load values int tid = threadIdx.x, ib = blockIdx.x, i = tid + ib * blockDim.x; if(i < n) arr[tid] = from[i]; __syncthreads(); // do in-block scan (inclusive scan) ipre_sum(arr, bs); __syncthreads(); if(i < n) { // store scratch values int isv = ib < gridDim.x - 1 ? bs - 1 : (n - 1) % bs; if(tid == 0 && scratch) scratch[ib] = arr[isv]; // store main array values if(tid > 0) to[i] = arr[tid - 1]; else to[i] = 0; //to[i] = from[i]; // store total sum value if(i == n - 1) to[n] = arr[isv]; } } // xpre_sum_step1_k /** kernel for updating total sum with scratch values */ template static __global__ void xpre_sum_update_scratch_k (T *to, const T* scratch, int n) { int tid = threadIdx.x, ib = blockIdx.x, i = tid + blockDim.x * ib; //if(ib == 0) // return; if(i < n) to[i] += scratch[ib]; if(i == n - 1) to[n] += scratch[ib]; } // xpre_sum_update_scratch_k /** exclusive prefix scan with dynamic parallelism; currently extremely sequential and extremely inefficient @param to where to store the result of the operation; should contain space for at least n elements TODO: make it parallel */ #define SCAN_BS 256 template static __device__ void dyn_xpre_sum (T *to, T *from, int n, T *scratch, cudaStream_t s = 0) { // block-wise scan int nb = divup(n, SCAN_BS); xpre_sum_block_k <<>> (to, scratch, from, n); cucheck_dev(cudaGetLastError()); // exclusive scan of scratch (assume it's just 1 block) // TODO: generalize the algorithm xpre_sum_block_k <<<1, SCAN_BS, 0, s>>>(scratch, 0, scratch, nb); cucheck_dev(cudaGetLastError()); // update blocks xpre_sum_update_scratch_k <<>>(to, scratch, n); cucheck_dev(cudaGetLastError()); // to[0] = 0; // for(int i = 1; i < n + 1; i++) // to[i] = to[i - 1] + from[i - 1]; } // dyn_xpre_sum #endif