/** @file bunch-ctx-dyn.cu.h kernels and methods for track reconstruction with
    dynamic parallelism */

/** call a specific member function of bunch for all bunches, each on its own
    stream; if the passed function is 0, just wait for all bunches */
static void for_all_bunches
(BunchCtx *bunches, cudaStream_t *streams, int nbunches,
 void (BunchCtx::*f)(cudaStream_t), bool sync = true) {
  for(int ibunch = 0; ibunch < nbunches; ibunch++) {
    if(sync)
      cucheck(cudaStreamSynchronize(streams[ibunch]));
    if(f)
      (bunches[ibunch].*f)(streams[ibunch]);
  }
}  // for_all_bunches

/** call a specific member function of bunch for all bunches, but on a single
    stream */
static void for_all_bunches
(BunchCtx *bunches, cudaStream_t s, int nbunches,
 void (BunchCtx::*f)(cudaStream_t), bool sync = true) {
  if(sync)
    cucheck(cudaStreamSynchronize(s));
  if(f) {
    for(int ibunch = 0; ibunch < nbunches; ibunch++)
      (bunches[ibunch].*f)(s);
  }
}  // for_all_bunches

/** reconstruction driver: one non-blocking host stream per bunch */
void reconstruct_hstream_fun(GlobalCtx *g) {
  nvtxRangePushA("with_streams");
  BunchCtx *bunches = g->bunches;
  int nbunches = g->nbunches;
  cudaStream_t *streams =
    (cudaStream_t *)malloc(nbunches * sizeof(cudaStream_t));
  for(int ibunch = 0; ibunch < nbunches; ibunch++) {
    cucheck(cudaStreamCreateWithFlags
            (&streams[ibunch], cudaStreamNonBlocking));
  }  // for(ibunch)
  nvtxRangePushA("reconstruct_hstreams");
  // do reconstruction for all bunches;
  // "false" means no sync _before_ the call (there is never a sync after
  // each call)
  reco_time_start();
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_tube_hits_hstream);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_half_skewlets_hstream);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_triplets_hstream, false);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_skewlets_pre_poca_hstream);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_skewlets_poca_hstream);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::find_cand_tracks_hstream, false);
  for_all_bunches(bunches, streams, nbunches,
                  &BunchCtx::eval_cand_tracks_hstream);
  cucheck(cudaDeviceSynchronize());
  reco_time_end();
  nvtxRangePop();
  // destroy the streams
  for(int ibunch = 0; ibunch < nbunches; ibunch++)
    cucheck(cudaStreamDestroy(streams[ibunch]));
  free(streams);
  nvtxRangePop();
}  // reconstruct_hstream_fun

/** structure to be passed to thread-based reconstruction */
typedef struct {
  /** global context */
  GlobalCtx *g;
  /** bunches to be processed */
  BunchCtx *bunches;
  /** number of bunches to be processed */
  int nbunches;
} thread_ctx_t;
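/* reconstruct_tfun below returns its elapsed time through pthread's void *
   result by punning a double, which is valid only where
   sizeof(double) <= sizeof(void *) (i.e. typical 64-bit systems). A more
   defensive alternative is memcpy-based packing; the two helpers below are
   only a sketch of that option (pack_time/unpack_time are hypothetical names,
   not part of this codebase, and the same size assumption still applies): */
#include <string.h>

static inline void *pack_time(double t) {
  void *p = 0;
  memcpy(&p, &t, sizeof(t));  // copy the bit pattern instead of punning
  return p;
}

static inline double unpack_time(void *p) {
  double t = 0;
  memcpy(&t, &p, sizeof(t));
  return t;
}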
/** thread function for host-stream reconstruction */
void *reconstruct_tfun(void *arg) {
  thread_ctx_t *ctx = (thread_ctx_t *)arg;
  GlobalCtx *g = ctx->g;
  BunchCtx *bunches = ctx->bunches;
  int nbunches = ctx->nbunches;
  // device and stream
  cudaStream_t s;
  cucheck(cudaSetDevice(g->opts.idevice));
  cucheck(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));
  // do reconstruction with timing
  double t1 = omp_get_wtime();
  // feed many bunches into a single stream
#if 0
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_tube_hits_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_half_skewlets_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_triplets_hstream);
  for_all_bunches(bunches, s, nbunches,
                  &BunchCtx::find_skewlets_pre_poca_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_skewlets_poca_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::find_cand_tracks_hstream);
  for_all_bunches(bunches, s, nbunches, &BunchCtx::eval_cand_tracks_hstream);
  for_all_bunches(bunches, s, nbunches, 0);
#else
  // process bunches sequentially
  for(int ibunch = 0; ibunch < nbunches; ibunch++) {
    bunches[ibunch].find_tube_hits_hstream(s);  // -> pivot_hits + skewed_hits
    cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_half_skewlets_hstream(s);  // nskewed_hits -> nhalf_skewlets
    //cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_triplets_hstream(s);  // npivot_hits -> ntriplets
    cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_skewlets_pre_poca_hstream(s);  // h_nhalf_skewlets -> npre_poca_pairs
    cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_skewlets_poca_hstream(s);  // h_npre_poca_pairs -> nskewlets
    //cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].find_cand_tracks_hstream(s);  // h_ntriplets -> ncand_tracks
    cucheck(cudaStreamSynchronize(s));
    bunches[ibunch].eval_cand_tracks_hstream(s);
  }  // for(each bunch)
#endif
  cucheck(cudaStreamSynchronize(s));
  double t2 = omp_get_wtime();
  cucheck(cudaStreamDestroy(s));
  // return the elapsed time through the void * result; this pointer punning
  // works on 64-bit systems only
  double t = t2 - t1;
  return *(void **)&t;
}  // reconstruct_tfun

/** reconstruction with host threads, one stream per thread */
void reconstruct_hthread_fun(GlobalCtx *g) {
  // TODO: move #threads into an option
  int nthreads = 32;
  char *nthreads_str;
#if WITH_DYN_PAR == 1
  nthreads_str = getenv("CUDA_DEVICE_MAX_CONNECTIONS");
#else
  nthreads_str = getenv("OMP_NUM_THREADS");
#endif
  if(nthreads_str)
    nthreads = atoi(nthreads_str);
  printf("using %d device connections and host threads\n", nthreads);
  int nbunches_per_thread = divup(g->nbunches, nthreads);
  // memory for everything
  pthread_t *threads = (pthread_t *)malloc(nthreads * sizeof(pthread_t));
  thread_ctx_t *ctxs = (thread_ctx_t *)malloc(nthreads * sizeof(thread_ctx_t));
  // launch all threads
  for(int ithread = 0; ithread < nthreads; ithread++) {
    ctxs[ithread].g = g;
    int start = ithread * nbunches_per_thread;
    int end = min(g->nbunches, start + nbunches_per_thread);
    ctxs[ithread].nbunches = end - start;
    ctxs[ithread].bunches = g->bunches + start;
    pthread_create(&threads[ithread], 0, &reconstruct_tfun, &ctxs[ithread]);
  }
  // wait for all threads, collecting per-thread time statistics
  double t = 0, mint = 1000.0, avgt = 0.0;
  for(int ithread = 0; ithread < nthreads; ithread++) {
    double tt = 0;
    // the same 64-bit-only punning as in reconstruct_tfun
    pthread_join(threads[ithread], (void **)&tt);
    t = max(t, tt);
    mint = min(tt, mint);
    avgt += tt;
  }  // for(ithread)
  // free memory
  free(threads);
  free(ctxs);
  // print resulting time
  printf("total reconstruction time (avg per thread): %.3lf ms\n",
         avgt * 1e3 / nthreads);
  printf("reconstruction time stats: max=%.3lf ms, min=%.3lf ms, avg=%.3lf ms\n",
         t * 1e3, mint * 1e3, avgt * 1e3 / nthreads);
}  // reconstruct_hthread_fun

/** copy the bunch context back from the device, to pick up the counters
    filled in on the device */
void BunchCtx::sync_back(cudaStream_t s) {
  cucheck(cudaMemcpyAsync(this, d_this, sizeof(*this),
                          cudaMemcpyDeviceToHost, s));
}  // sync_back

void BunchCtx::find_tube_hits_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // find pivot tube hits
  host_copy_if(pivot_hits, &d_this->npivot_hits, hits, nhits,
               hit_is_pivot(g->d_tubes), s);
  // find skewed tube hits
  host_copy_if(skewed_hits, &d_this->nskewed_hits, hits, nhits,
               hit_is_skewed(g->d_tubes), s);
  sync_back(s);
}  // find_tube_hits
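/* host_copy_if above is this codebase's stream-ordered predicated compaction:
   it compacts the hits on stream s and writes the output count straight into
   device memory, so a later sync_back() can fetch it. The sketch below shows
   one way such a helper can be built on CUB's DeviceSelect::If; the actual
   helper may differ, and host_copy_if_sketch, its temp-storage handling, and
   the use of cudaMallocAsync (CUDA 11.2+) are assumptions for illustration: */
#include <cub/device/device_select.cuh>

template<typename T, typename Pred>
static void host_copy_if_sketch(T *d_out, int *d_nout, const T *d_in, int n,
                                Pred pred, cudaStream_t s) {
  void *d_temp = 0;
  size_t temp_bytes = 0;
  // first call only computes the required temporary storage size
  cub::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_nout, n, pred, s);
  cucheck(cudaMallocAsync(&d_temp, temp_bytes, s));
  // second call performs the compaction; d_nout is written on the device
  cub::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_nout, n, pred, s);
  cucheck(cudaFreeAsync(d_temp, s));
}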
void BunchCtx::find_triplets_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // find triplets
  size_t ntriplets_sz = g->npivot_ranges * sizeof(int);
  cucheck(cudaMemsetAsync(ntriplets, 0, ntriplets_sz, s));
  if(npivot_hits > 0) {
    int bs = 256;
    // launch configuration reconstructed (the <<<...>>> arguments were lost
    // in the source); one thread per pivot hit is assumed, following the
    // divup(n, bs) idiom used elsewhere in this file
    find_triplets_k<<<divup(npivot_hits, bs), bs, 0, s>>>(d_this);
    cucheck(cudaGetLastError());
  }
  cucheck(cudaMemcpyAsync(h_ntriplets, ntriplets, ntriplets_sz,
                          cudaMemcpyDeviceToHost, s));
}  // find_triplets

void BunchCtx::find_half_skewlets_hstream(cudaStream_t s) {
  // find half-skewlets
  const GlobalCtx *g = h_global_ctx;
  size_t nhalf_skewlets_sz = g->nskewed_ranges * sizeof(int);
  int bs = 256;
  if(nskewed_hits > 0) {
    // reconstructed launch: one thread per skewed hit (assumed)
    find_half_skewlets_k<<<divup(nskewed_hits, bs), bs, 0, s>>>(d_this);
    cucheck(cudaGetLastError());
  }
  cucheck(cudaMemcpyAsync(h_nhalf_skewlets, nhalf_skewlets, nhalf_skewlets_sz,
                          cudaMemcpyDeviceToHost, s));
}  // find_half_skewlets

void BunchCtx::find_skewlets_pre_poca_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // main skewlet test (everything except the poca test)
  for(int irange = 0; irange < g->nskewlet_ranges; irange++) {
    int ninner = h_nhalf_skewlets[irange];
    int nouter = h_nhalf_skewlets[irange + 1];
    if(ninner == 0 || nouter == 0)
      continue;
    // main test
    dim3 bs(SKEWLETS_BSX, SKEWLETS_BSY);
    dim3 grid(divup(ninner, bs.x), divup(nouter, bs.y));
    combine_half_skewlets_k<<<grid, bs, 0, s>>>(d_this, irange);
    cucheck(cudaGetLastError());
  }  // loop over ranges
  size_t npre_poca_pairs_sz = g->nskewlet_ranges * sizeof(int);
  cucheck(cudaMemcpyAsync
          (h_npre_poca_pairs, npre_poca_pairs, npre_poca_pairs_sz,
           cudaMemcpyDeviceToHost, s));
}  // find_skewlets_pre_poca_hstream

void BunchCtx::find_skewlets_poca_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // poca test
  for(int irange = 0; irange < g->nskewlet_ranges; irange++) {
    int npairs = h_npre_poca_pairs[irange];
    if(npairs == 0)
      continue;
    int bs = 128;
    // reconstructed launch: one thread per pre-poca pair (assumed)
    poca_test_k<<<divup(npairs, bs), bs, 0, s>>>(d_this, irange);
    cucheck(cudaGetLastError());
  }
  size_t nskewlets_sz = g->nskewlet_ranges * sizeof(int);
  cucheck(cudaMemcpyAsync(h_nskewlets, nskewlets, nskewlets_sz,
                          cudaMemcpyDeviceToHost, s));
}  // find_skewlets_poca_hstream

void BunchCtx::find_cand_tracks_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  // go through all pairs of pivot ranges
  for(int irange = 0; irange < g->npivot_ranges; irange++)
    for(int jrange = irange + 1; jrange < g->npivot_ranges; jrange++) {
      int ni = h_ntriplets[irange], nj = h_ntriplets[jrange];
      if(ni == 0 || nj == 0)
        continue;
      int bsi = 64, bsj = 4;
      dim3 bs(bsi, bsj), grid(divup(ni, bsi), divup(nj, bsj));
      combine_triplets_k<<<grid, bs, 0, s>>>
        (d_this, irange, jrange, h_max_phis_g[irange][jrange]);
      cucheck(cudaGetLastError());
    }
  sync_back(s);
}  // find_cand_tracks_hstream

void BunchCtx::compute_track_circles(cudaStream_t s) {
  int bs = 256;
  if(ncand_tracks > 0) {
    // reconstructed launch: one thread per candidate track (assumed)
    compute_circles_k<<<divup(ncand_tracks, bs), bs, 0, s>>>
      (cand_tracks, ncand_tracks);
    cucheck(cudaGetLastError());
  }
}  // compute_track_circles

void BunchCtx::eval_cand_tracks_hstream(cudaStream_t s) {
  const GlobalCtx *g = h_global_ctx;
  int bs = 0;
  if(ncand_tracks > 0) {
    bs = 256;
    compute_circles_k<<<divup(ncand_tracks, bs), bs, 0, s>>>
      (cand_tracks, ncand_tracks);
    cucheck(cudaGetLastError());
    bs = EVAL_TRACKS_BS;
    // reconstructed launches: one thread per candidate track is assumed for
    // the evaluation kernels as well
    switch(g->opts.tube_strategy()) {
    case TubeTestRaster:
      eval_tracks_tubes_raster_k<<<divup(ncand_tracks, bs), bs, 0, s>>>(d_this);
      break;
    case TubeTestShared:
      eval_tracks_tubes_shared_k<<<divup(ncand_tracks, bs), bs, 0, s>>>(d_this);
      break;
    default:
      assert(0);
    }
    cucheck(cudaGetLastError());
    eval_tracks_skewlets_k<<<divup(ncand_tracks, bs), bs, 0, s>>>(d_this);
    cucheck(cudaGetLastError());
  }
  // group all good tracks
  host_copy_if(found_tracks, &d_this->nfound_tracks, cand_tracks, ncand_tracks,
               is_track_good(7, 12, 10, g->d_tubes), s);
  sync_back(s);
}  // eval_cand_tracks
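/* For reference, WITH_DYN_PAR == 1 selects this codebase's
   dynamic-parallelism build, where a device-side driver launches the stage
   kernels from the GPU instead of a host stream. The kernel below is only a
   sketch of that pattern (reconstruct_bunch_dyn_k_sketch is a hypothetical
   name, not this codebase's actual device-side driver; it also assumes divup
   is callable from device code, and device-side cudaDeviceSynchronize(),
   which it relies on, was removed in CUDA 12): */
__global__ void reconstruct_bunch_dyn_k_sketch(BunchCtx *bunch) {
  // a single parent thread drives one bunch through a stage and its sync
  if(blockIdx.x != 0 || threadIdx.x != 0)
    return;
  int bs = 256;
  if(bunch->npivot_hits > 0) {
    // child grid launched from the device; no host round-trip for the count
    find_triplets_k<<<divup(bunch->npivot_hits, bs), bs>>>(bunch);
  }
  // wait for the child grid before any dependent stage reads ntriplets
  cudaDeviceSynchronize();
}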