Skip to content

Commit

Permalink
GPU: Add tpcApplyCFCutsAtDecoding option to apply cluster cuts of CF …
Browse files Browse the repository at this point in the history
…during CTF decoding
  • Loading branch information
davidrohr committed Oct 14, 2024
1 parent 1ff58c4 commit d56183e
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 26 deletions.
1 change: 1 addition & 0 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ AddOption(tpcSingleSector, int32_t, -1, "", 0, "Restrict TPC processing to a sin
AddOption(tpcDownscaledEdx, uint8_t, 0, "", 0, "If != 0, downscale dEdx processing (if enabled) to x %")
AddOption(tpcMaxAttachedClustersPerSectorRow, uint32_t, 51000, "", 0, "Maximum number of TPC attached clusters which can be decoded per SectorRow")
AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding")
AddOption(tpcApplyCFCutsAtDecoding, bool, false, "", 0, "Apply cluster cuts from clusterization during decoding of compressed clusters")
AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored")
AddOption(RTCprependCommand, std::string, "", "", 0, "Prepend RTC compilation commands by this string")
AddOption(RTCoverrideArchitecture, std::string, "", "", 0, "Override architecture part of RTC compilation command line")
Expand Down
7 changes: 3 additions & 4 deletions GPU/GPUTracking/Global/GPUChain.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,31 +59,30 @@ class GPUChain

const GPUParam& GetParam() const { return mRec->mHostConstantMem->param; }
const GPUSettingsGRP& GetGRPSettings() const { return mRec->mGRPSettings; }
const GPUSettingsDeviceBackend& GetDeviceBackendSettings() const { return mRec->mDeviceBackendSettings; }
const GPUSettingsProcessing& GetProcessingSettings() const { return mRec->mProcessingSettings; }
const GPUCalibObjectsConst& calib() const { return processors()->calibObjects; }
GPUReconstruction* rec() { return mRec; }
const GPUReconstruction* rec() const { return mRec; }
inline const GPUConstantMem* GetProcessors() { return mRec->processors(); }

// Make functions from GPUReconstruction*** available
GPUReconstruction::RecoStepField GetRecoSteps() const { return mRec->GetRecoSteps(); }
GPUReconstruction::RecoStepField GetRecoStepsGPU() const { return mRec->GetRecoStepsGPU(); }
GPUReconstruction::InOutTypeField GetRecoStepsInputs() const { return mRec->GetRecoStepsInputs(); }
GPUReconstruction::InOutTypeField GetRecoStepsOutputs() const { return mRec->GetRecoStepsOutputs(); }
inline const GPUSettingsDeviceBackend& GetDeviceBackendSettings() const { return mRec->mDeviceBackendSettings; }
inline const GPUSettingsProcessing& GetProcessingSettings() const { return mRec->mProcessingSettings; }

protected:
GPUReconstructionCPU* mRec;
GPUChain(GPUReconstruction* rec) : mRec((GPUReconstructionCPU*)rec) {}

int32_t GetThread();

// Make functions from GPUReconstruction*** available
inline GPUConstantMem* processors() { return mRec->processors(); }
inline GPUConstantMem* processorsShadow() { return mRec->mProcessorsShadow; }
inline GPUConstantMem* processorsDevice() { return mRec->mDeviceConstantMem; }
inline GPUParam& param() { return mRec->param(); }
inline const GPUConstantMem* processors() const { return mRec->processors(); }
inline GPUSettingsProcessing& ProcessingSettings() { return mRec->mProcessingSettings; }
inline void SynchronizeStream(int32_t stream) { mRec->SynchronizeStream(stream); }
inline void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) { mRec->SynchronizeEvents(evList, nEvents); }
inline void SynchronizeEventAndRelease(deviceEvent& ev, bool doGPU = true)
Expand Down
10 changes: 7 additions & 3 deletions GPU/GPUTracking/Global/GPUChainTracking.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ bool GPUChainTracking::ValidateSettings()
GPUError("Must use external output for double pipeline mode");
return false;
}
if (ProcessingSettings().tpcCompressionGatherMode == 1) {
if (GetProcessingSettings().tpcCompressionGatherMode == 1) {
GPUError("Double pipeline incompatible to compression mode 1");
return false;
}
Expand All @@ -318,7 +318,11 @@ bool GPUChainTracking::ValidateSettings()
return false;
}
}
if ((GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCCompression) && !(GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCCompression) && (ProcessingSettings().tpcCompressionGatherMode == 1 || ProcessingSettings().tpcCompressionGatherMode == 3)) {
if ((GetRecoSteps() & GPUDataTypes::RecoStep::TPCDecompression) && GetProcessingSettings().tpcApplyCFCutsAtDecoding && !GetProcessingSettings().tpcUseOldCPUDecoding) {
GPUError("tpcApplyCFCutsAtDecoding currently requires tpcUseOldCPUDecoding");
return false;
}
if ((GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCCompression) && !(GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCCompression) && (GetProcessingSettings().tpcCompressionGatherMode == 1 || GetProcessingSettings().tpcCompressionGatherMode == 3)) {
GPUError("Invalid tpcCompressionGatherMode for compression on CPU");
return false;
}
Expand Down Expand Up @@ -888,7 +892,7 @@ int32_t GPUChainTracking::RunChainFinalize()
if (GetProcessingSettings().eventDisplay->getDisplayControl() == 2) {
mDisplayRunning = false;
GetProcessingSettings().eventDisplay->DisplayExit();
ProcessingSettings().eventDisplay = nullptr;
const_cast<GPUSettingsProcessing&>(GetProcessingSettings()).eventDisplay = nullptr; // TODO: fixme - eventDisplay should probably not be put into ProcessingSettings in the first place
return (2);
}
GetProcessingSettings().eventDisplay->setDisplayControl(0);
Expand Down
68 changes: 51 additions & 17 deletions GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ int32_t GPUChainTracking::RunTPCCompression()
RecordMarker(mEvents->single, 0);
}

if (ProcessingSettings().tpcCompressionGatherMode == 3) {
if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
mRec->AllocateVolatileDeviceMemory(0); // make future device memory allocation volatile
}
SetupGPUProcessor(&Compressor, true);
Expand Down Expand Up @@ -73,19 +73,19 @@ int32_t GPUChainTracking::RunTPCCompression()
Compressor.mOutputFlat->set(outputSize, *Compressor.mOutput);
char* hostFlatPtr = (char*)Compressor.mOutput->qTotU; // First array as allocated in GPUTPCCompression::SetPointersCompressedClusters
size_t copySize = 0;
if (ProcessingSettings().tpcCompressionGatherMode == 3) {
if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
CompressorShadow.mOutputA = Compressor.mOutput;
copySize = AllocateRegisteredMemory(Compressor.mMemoryResOutputGPU); // We overwrite Compressor.mOutput with the allocated output pointers on the GPU
}
const o2::tpc::CompressedClustersPtrs* P = nullptr;
HighResTimer* gatherTimer = nullptr;
int32_t outputStream = 0;
if (ProcessingSettings().doublePipeline) {
if (GetProcessingSettings().doublePipeline) {
SynchronizeStream(OutputStream()); // Synchronize output copies running in parallel from memory that might be released, only the following async copy from stacked memory is safe after the chain finishes.
outputStream = OutputStream();
}
if (ProcessingSettings().tpcCompressionGatherMode >= 2) {
if (ProcessingSettings().tpcCompressionGatherMode == 2) {
if (GetProcessingSettings().tpcCompressionGatherMode >= 2) {
if (GetProcessingSettings().tpcCompressionGatherMode == 2) {
void* devicePtr = mRec->getGPUPointer(Compressor.mOutputFlat);
if (devicePtr != Compressor.mOutputFlat) {
CompressedClustersPtrs& ptrs = *Compressor.mOutput; // We need to update the ptrs with the gpu-mapped version of the host address space
Expand All @@ -97,7 +97,7 @@ int32_t GPUChainTracking::RunTPCCompression()
TransferMemoryResourcesToGPU(myStep, &Compressor, outputStream);
constexpr uint32_t nBlocksDefault = 2;
constexpr uint32_t nBlocksMulti = 1 + 2 * 200;
switch (ProcessingSettings().tpcCompressionGatherModeKernel) {
switch (GetProcessingSettings().tpcCompressionGatherModeKernel) {
case 0:
runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::unbuffered>(GetGridBlkStep(nBlocksDefault, outputStream, RecoStep::TPCCompression));
getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::unbuffered>(RecoStep::TPCCompression, 0, outputSize, false);
Expand All @@ -120,10 +120,10 @@ int32_t GPUChainTracking::RunTPCCompression()
getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::multiBlock>(RecoStep::TPCCompression, 0, outputSize, false);
break;
default:
GPUError("Invalid compression kernel %d selected.", (int32_t)ProcessingSettings().tpcCompressionGatherModeKernel);
GPUError("Invalid compression kernel %d selected.", (int32_t)GetProcessingSettings().tpcCompressionGatherModeKernel);
return 1;
}
if (ProcessingSettings().tpcCompressionGatherMode == 3) {
if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
RecordMarker(mEvents->stream[outputStream], outputStream);
char* deviceFlatPts = (char*)Compressor.mOutput->qTotU;
if (GetProcessingSettings().doublePipeline) {
Expand All @@ -138,9 +138,9 @@ int32_t GPUChainTracking::RunTPCCompression()
}
} else {
int8_t direction = 0;
if (ProcessingSettings().tpcCompressionGatherMode == 0) {
if (GetProcessingSettings().tpcCompressionGatherMode == 0) {
P = &CompressorShadow.mPtrs;
} else if (ProcessingSettings().tpcCompressionGatherMode == 1) {
} else if (GetProcessingSettings().tpcCompressionGatherMode == 1) {
P = &Compressor.mPtrs;
direction = -1;
gatherTimer = &getTimer<GPUTPCCompressionKernels>("GPUTPCCompression_GatherOnCPU", 0);
Expand Down Expand Up @@ -184,11 +184,11 @@ int32_t GPUChainTracking::RunTPCCompression()
GPUMemCpyAlways(myStep, O->timeA, P->timeA, O->nTracks * sizeof(O->timeA[0]), outputStream, direction);
GPUMemCpyAlways(myStep, O->padA, P->padA, O->nTracks * sizeof(O->padA[0]), outputStream, direction);
}
if (ProcessingSettings().tpcCompressionGatherMode == 1) {
if (GetProcessingSettings().tpcCompressionGatherMode == 1) {
gatherTimer->Stop();
}
mIOPtrs.tpcCompressedClusters = Compressor.mOutputFlat;
if (ProcessingSettings().tpcCompressionGatherMode == 3) {
if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
SynchronizeEventAndRelease(mEvents->stream[outputStream]);
mRec->ReturnVolatileDeviceMemory();
}
Expand All @@ -209,18 +209,52 @@ int32_t GPUChainTracking::RunTPCDecompression()
if (GetProcessingSettings().tpcUseOldCPUDecoding) {
const auto& threadContext = GetThreadContext();
TPCClusterDecompressor decomp;
auto allocator = [this](size_t size) {
auto allocatorFinal = [this](size_t size) {
this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size;
this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
return this->mInputsHost->mPclusterNativeOutput;
};
auto& gatherTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
gatherTimer.Start();
if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocator, param(), GetProcessingSettings().deterministicGPUReconstruction)) {
std::unique_ptr<ClusterNative[]> tmpBuffer;
auto allocatorTmp = [&tmpBuffer](size_t size) {
return ((tmpBuffer = std::make_unique<ClusterNative[]>(size))).get();
};
auto& decompressTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
auto allocatorUse = GetProcessingSettings().tpcApplyCFCutsAtDecoding ? std::function<ClusterNative*(size_t)>{allocatorTmp} : std::function<ClusterNative*(size_t)>{allocatorFinal};
decompressTimer.Start();
if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocatorUse, param(), GetProcessingSettings().deterministicGPUReconstruction)) {
GPUError("Error decompressing clusters");
return 1;
}
gatherTimer.Stop();
if (GetProcessingSettings().tpcApplyCFCutsAtDecoding) {
ClusterNative* outputBuffer;
for (int32_t iPhase = 0; iPhase < 2; iPhase++) {
uint32_t countTotal = 0;
for (uint32_t iSector = 0; iSector < GPUCA_NSLICES; iSector++) {
for (uint32_t iRow = 0; iRow < GPUCA_ROW_COUNT; iRow++) {
uint32_t count = 0;
for (uint32_t k = 0; k < mClusterNativeAccess->nClusters[iSector][iRow]; k++) {
const ClusterNative& cl = mClusterNativeAccess->clusters[iSector][iRow][k];
bool keep = cl.qTot > param().rec.tpc.cfQTotCutoff && cl.qMax > param().rec.tpc.cfQMaxCutoff && (cl.sigmaPadPacked || cl.qMax > param().rec.tpc.cfQMaxCutoffSinglePad) && (cl.sigmaTimePacked || cl.qMax > param().rec.tpc.cfQMaxCutoffSingleTime);
count += keep;
countTotal += keep;
if (iPhase) {
outputBuffer[countTotal] = cl;
}
}
if (iPhase) {
mClusterNativeAccess->nClusters[iSector][iRow] = count;
}
}
}
if (iPhase) {
mClusterNativeAccess->clustersLinear = outputBuffer;
mClusterNativeAccess->setOffsetPtrs();
} else {
outputBuffer = allocatorFinal(countTotal);
}
}
}
decompressTimer.Stop();
mIOPtrs.clustersNative = mClusterNativeAccess.get();
if (mRec->IsGPU()) {
AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
Expand Down
4 changes: 2 additions & 2 deletions GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ void GPUChainTracking::PrintOutputStat()
{
int32_t nTracks = 0, nAttachedClusters = 0, nAttachedClustersFitted = 0, nAdjacentClusters = 0;
uint32_t nCls = GetProcessingSettings().doublePipeline ? mIOPtrs.clustersNative->nClustersTotal : GetTPCMerger().NMaxClusters();
if (ProcessingSettings().createO2Output > 1) {
if (GetProcessingSettings().createO2Output > 1) {
nTracks = mIOPtrs.nOutputTracksTPCO2;
nAttachedClusters = mIOPtrs.nMergedTrackHits;
} else {
Expand Down Expand Up @@ -244,7 +244,7 @@ void GPUChainTracking::PrintOutputStat()
}
snprintf(trdText, 1024, " - TRD Tracker reconstructed %d tracks (%d tracklets)", nTRDTracks, nTRDTracklets);
}
GPUInfo("Output Tracks: %d (%d / %d / %d / %d clusters (fitted / attached / adjacent / total) - %s format)%s", nTracks, nAttachedClustersFitted, nAttachedClusters, nAdjacentClusters, nCls, ProcessingSettings().createO2Output > 1 ? "O2" : "GPU", trdText);
GPUInfo("Output Tracks: %d (%d / %d / %d / %d clusters (fitted / attached / adjacent / total) - %s format)%s", nTracks, nAttachedClustersFitted, nAttachedClusters, nAdjacentClusters, nCls, GetProcessingSettings().createO2Output > 1 ? "O2" : "GPU", trdText);
}

void GPUChainTracking::SanityCheck()
Expand Down

0 comments on commit d56183e

Please sign in to comment.