From d20eab80b8ee6df2b7e0a0b1aee8300d30fd421d Mon Sep 17 00:00:00 2001
From: Jiri Kraus
Date: Thu, 20 Mar 2014 10:30:08 +0100
Subject: [PATCH] Added NVTX instrumentation

---
Review notes (free text between "---" and the first "diff --git" is not part
of the commit message):
 * Makefile.inc: CUDA_DIR falls back to the pre-patch /usr/local/cuda-5.0
   location when CUDAROOT is unset; THRUST_DIR and NVCC derive from CUDA_DIR,
   so with CUDAROOT set they expand exactly as before.
 * Makefile.inc: DEDISP_DIR and UCFLAGS are assigned with ?= so a value from
   the environment takes precedence over the defaults below.

 Makefile                              |   6 +-
 Makefile.inc                          |  11 +--
 include/transforms/harmonicfolder.hpp |   3 +
 include/utils/nvtx.hpp                |  24 ++++++
 src/pipeline_multi.cpp                | 101 ++++++++++++++------------
 5 files changed, 89 insertions(+), 56 deletions(-)
 create mode 100644 include/utils/nvtx.hpp

diff --git a/Makefile b/Makefile
index 02f5e99..b812a44 100644
--- a/Makefile
+++ b/Makefile
@@ -14,13 +14,13 @@ DEBUG =
 
 # Includes and libraries
 INCLUDE = -I$(INCLUDE_DIR) -I$(THRUST_DIR) -I${DEDISP_DIR}/include -I${CUDA_DIR}/include -I./tclap
-LIBS = -L$(CUDA_DIR)/lib64 -lcuda -lcudart -L${DEDISP_DIR}/lib -ldedisp -lcufft -lpthread
+LIBS = -L$(CUDA_DIR)/lib64 -lcuda -lcudart -L${DEDISP_DIR}/lib -ldedisp -lcufft -lpthread -lnvToolsExt
 
 # compiler flags
 # --compiler-options -Wall
 NVCC_COMP_FLAGS = -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_30,code=sm_30
-NVCCFLAGS = ${OPTIMISE} ${NVCC_COMP_FLAGS} --machine 64 -Xcompiler ${DEBUG}
-CFLAGS = -fPIC ${OPTIMISE} ${DEBUG}
+NVCCFLAGS = ${UCFLAGS} ${OPTIMISE} ${NVCC_COMP_FLAGS} --machine 64 -Xcompiler ${DEBUG}
+CFLAGS = ${UCFLAGS} -fPIC ${OPTIMISE} ${DEBUG}
 
 OBJECTS = ${OBJ_DIR}/kernels.o
 EXE_FILES = ${BIN_DIR}/peasoup ${BIN_DIR}/resampling_test ${BIN_DIR}/harmonic_sum_test
diff --git a/Makefile.inc b/Makefile.inc
index 7c6db24..8de7f5a 100644
--- a/Makefile.inc
+++ b/Makefile.inc
@@ -1,14 +1,15 @@
 # This is where common definitions go
 
 #cuda setup
-CUDA_DIR = /usr/local/cuda-5.0/
-THRUST_DIR = /usr/local/cuda-5.0/include/
+CUDA_DIR = $(or $(CUDAROOT),/usr/local/cuda-5.0)
+THRUST_DIR = $(CUDA_DIR)/include/
 
 #dedisp setup
-DEDISP_DIR = /mnt/home/ebarr/Soft/dedisp
+DEDISP_DIR ?= /homeb/zam/jkraus/workspace/PulsarSearch/dedisp
 
 GCC = gcc
 GXX = g++
 AR = ar
-NVCC = /usr/local/cuda-5.0/bin/nvcc
-SHELL = /bin/csh
\ No newline at end of file
+NVCC = $(CUDA_DIR)/bin/nvcc
+SHELL = /bin/bash
+UCFLAGS ?= -DUSE_NVTX
\ No
newline at end of file diff --git a/include/transforms/harmonicfolder.hpp b/include/transforms/harmonicfolder.hpp index ae8dedb..b370f4e 100644 --- a/include/transforms/harmonicfolder.hpp +++ b/include/transforms/harmonicfolder.hpp @@ -3,6 +3,7 @@ #include #include #include +#include class HarmonicFolder { private: @@ -24,6 +25,7 @@ class HarmonicFolder { void fold(DevicePowerSpectrum& fold0) { + PUSH_NVTX_RANGE("Harmonic summing",2) float** h_data_ptrs; float** d_data_ptrs; Utils::device_malloc(&d_data_ptrs,sums.size()); @@ -37,6 +39,7 @@ class HarmonicFolder { device_harmonic_sum(fold0.get_data(),d_data_ptrs, fold0.get_nbins(),sums.size(), max_blocks,max_threads); + POP_NVTX_RANGE } ~HarmonicFolder()
diff --git a/include/utils/nvtx.hpp b/include/utils/nvtx.hpp
new file mode 100644
index 0000000..dd5a7c2
--- /dev/null
+++ b/include/utils/nvtx.hpp
@@ -0,0 +1,24 @@
+#pragma once
+#ifdef USE_NVTX // without USE_NVTX both macros below expand to nothing
+#include "nvToolsExt.h"
+// ARGB colour table; PUSH_NVTX_RANGE picks the entry at index cid modulo num_colors.
+static const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
+static const int num_colors = sizeof(colors)/sizeof(uint32_t);
+// PUSH_NVTX_RANGE(name,cid) calls nvtxRangePushEx with an ASCII message and a colour; POP_NVTX_RANGE calls nvtxRangePop.
+#define PUSH_NVTX_RANGE(name,cid) { \
+    int color_id = (cid); \
+    color_id = ((color_id%num_colors)+num_colors)%num_colors; /* index stays in [0,num_colors) even for cid<0 */ \
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = (name); \
+    nvtxRangePushEx(&eventAttrib); \
+}
+#define POP_NVTX_RANGE nvtxRangePop();
+#else
+#define PUSH_NVTX_RANGE(name,cid)
+#define POP_NVTX_RANGE
+#endif
diff --git a/src/pipeline_multi.cpp b/src/pipeline_multi.cpp index 57999f0..5ae44c4 100644 --- a/src/pipeline_multi.cpp +++ b/src/pipeline_multi.cpp @@ -134,100 +134,103 @@ class Worker { float padding_mean; int ii; + PUSH_NVTX_RANGE("DM-Loop",0) while (true){ ii = manager.get_dm_trial_idx(); if (ii==-1) -
break; + break; trials.get_idx(ii,tim); if (args.verbose) - std::cout << "Copying DM trial to device (DM: " << tim.get_dm() << ")"<< std::endl; + std::cout << "Copying DM trial to device (DM: " << tim.get_dm() << ")"<< std::endl; d_tim.copy_from_host(tim); if (padding){ - padding_mean = stats::mean(d_tim.get_data(),trials.get_nsamps()); - d_tim.fill(trials.get_nsamps(),d_tim.get_nsamps(),padding_mean); + padding_mean = stats::mean(d_tim.get_data(),trials.get_nsamps()); + d_tim.fill(trials.get_nsamps(),d_tim.get_nsamps(),padding_mean); } if (args.verbose) - std::cout << "Generating accelration list" << std::endl; + std::cout << "Generating accelration list" << std::endl; acc_plan.generate_accel_list(tim.get_dm(),acc_list); if (args.verbose) - std::cout << "Searching "<< acc_list.size()<< " acceleration trials for DM "<< tim.get_dm() << std::endl; + std::cout << "Searching "<< acc_list.size()<< " acceleration trials for DM "<< tim.get_dm() << std::endl; if (args.verbose) - std::cout << "Executing forward FFT" << std::endl; + std::cout << "Executing forward FFT" << std::endl; r2cfft.execute(d_tim.get_data(),d_fseries.get_data()); if (args.verbose) - std::cout << "Forming power spectrum" << std::endl; + std::cout << "Forming power spectrum" << std::endl; former.form(d_fseries,pspec); if (args.verbose) - std::cout << "Finding running median" << std::endl; + std::cout << "Finding running median" << std::endl; rednoise.calculate_median(pspec); if (args.verbose) - std::cout << "Dereddening Fourier series" << std::endl; + std::cout << "Dereddening Fourier series" << std::endl; rednoise.deredden(d_fseries); if (args.zapfilename!=""){ - if (args.verbose) - std::cout << "Zapping birdies" << std::endl; - bzap->zap(d_fseries); + if (args.verbose) + std::cout << "Zapping birdies" << std::endl; + bzap->zap(d_fseries); } if (args.verbose) - std::cout << "Forming interpolated power spectrum" << std::endl; + std::cout << "Forming interpolated power spectrum" << std::endl; 
former.form_interpolated(d_fseries,pspec); if (args.verbose) - std::cout << "Finding statistics" << std::endl; + std::cout << "Finding statistics" << std::endl; stats::stats(pspec.get_data(),size/2+1,&mean,&rms,&std); if (args.verbose) - std::cout << "Executing inverse FFT" << std::endl; + std::cout << "Executing inverse FFT" << std::endl; c2rfft.execute(d_fseries.get_data(),d_tim.get_data()); - CandidateCollection accel_trial_cands; + CandidateCollection accel_trial_cands; + PUSH_NVTX_RANGE("Acceleration-Loop",1) for (int jj=0;jj trials = dedisperser.dedisperse(); + POP_NVTX_RANGE timers["dedispersion"].stop(); if (args.progress_bar)