diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h index db57e7ec06d4b..be88973561e0a 100644 --- a/GPU/Common/GPUCommonAlgorithm.h +++ b/GPU/Common/GPUCommonAlgorithm.h @@ -354,6 +354,11 @@ GPUdi() uint8_t warp_broadcast_FUNC(uint8_t v, int32_t i) #define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v) #define warp_broadcast(v, i) warp_broadcast_FUNC(v, i) +[[nodiscard]] GPUdi() int32_t work_group_count(bool pred) +{ + return work_group_reduce_add((int32_t)pred); +} + #elif (defined(__CUDACC__) || defined(__HIPCC__)) // CUDA and HIP work the same way using cub, need just different header @@ -416,6 +421,16 @@ GPUdi() T warp_broadcast_FUNC(T v, int32_t i) #endif } +[[nodiscard]] GPUdi() bool work_group_any(bool pred) +{ + return __syncthreads_or(pred); +} + +[[nodiscard]] GPUdi() uint32_t work_group_count(bool pred) +{ + return __syncthreads_count(pred); +} + #else // Trivial implementation for the CPU @@ -449,6 +464,16 @@ GPUdi() T warp_broadcast(T v, int32_t i) return v; } +[[nodiscard]] GPUdi() bool work_group_any(bool pred) +{ + return pred; +} + +[[nodiscard]] GPUdi() uint32_t work_group_count(bool pred) +{ + return pred; +} + #endif #ifdef GPUCA_ALGORITHM_STD diff --git a/GPU/GPUTracking/DataTypes/GPUTPCExtraADC.h b/GPU/GPUTracking/DataTypes/GPUTPCExtraADC.h new file mode 100644 index 0000000000000..64fc5da833169 --- /dev/null +++ b/GPU/GPUTracking/DataTypes/GPUTPCExtraADC.h @@ -0,0 +1,25 @@ +// Copyright 2019-2026 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCExtraADC.h +/// \author Felix Weiglhofer + +#include "GPUDefConstantsAndSettings.h" +#include "DataFormatsTPC/Digit.h" +#include +#include + +namespace o2::gpu +{ +struct GPUTPCExtraADC { + std::array, tpc::constants::MAXSECTOR> digitsBySector; +}; +} // namespace o2::gpu diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 43a5f4f79abdc..eb7d67a913ceb 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -112,6 +112,9 @@ AddOptionRTC(trackletMinSharedNormFactor, float, 0.f, "", 0, "Max shared defined AddOptionRTC(maxTimeBinAboveThresholdIn1000Bin, uint16_t, 500, "", 0, "Except pad from cluster finding if total number of charges in a fragment is above this baseline (disable = 0)") AddOptionRTC(maxConsecTimeBinAboveThreshold, uint16_t, 200, "", 0, "Except pad from cluster finding if number of consecutive charges in a fragment is above this baseline (disable = 0)") AddOptionRTC(noisyPadSaturationThreshold, uint16_t, 700, "", 0, "Threshold where a timebin is considered saturated, disabling the noisy pad check for that pad") +AddOptionRTC(hipTailFilter, uint8_t, 0, "", 0, "Enable Highly Ionising Particle tail filter in CheckPadBaseline (0 = disable, 1 = filter tails)") +AddOptionRTC(hipTailFilterThreshold, uint16_t, 100, "", 0, "Threshold that must be exceeded for a timebin to be counted towards Highly Ionising Particle tail") +AddOptionRTC(hipTailFilterAlpha, float, 0.5f, "", 0, "Smoothing factor for the exponential Highly Ionising Particle tail filter") AddOptionRTC(occupancyMapTimeBins, uint16_t, 16, "", 0, "Number of timebins per histogram bin of occupancy map (0 = disable occupancy map)") AddOptionRTC(occupancyMapTimeBinsAverage, uint16_t, 0, "", 0, "Number of timebins +/- to use for the averaging") AddOptionRTC(trackFitCovLimit, uint16_t, 1000, "", 0, "Abort fit when y/z cov exceed the limit") diff --git a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv index ef215ba5ca870..823a70b24565b 100644 --- a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv +++ b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv @@ -60,6 +60,8 @@ GPUTPCGMO2Output_output,256,,,,,,,,,,,,,,,256 GPUTPCStartHitsFinder,256,,"[1024, 2]","[1024, 7]",256,256,256,256,256,512,512,512,,,,608 GPUTPCStartHitsSorter,256,,"[1024, 5]","[512, 7]",256,256,256,256,256,"[512, 1]","[512, 1]","[512, 1]",,,,608 GPUTPCCFCheckPadBaseline,576,,"[576, 2]","[576, 2]",,,,,,"[576, 2]",,,,,,"[576, 2]" +GPUTPCCFHIPTailConnector,256,,256,256,,,,,,256, +GPUTPCCFHIPClusterizer,256,,256,256,,,,,,256, GPUTPCCFChargeMapFiller_fillIndexMap,512,,512,512,,,,,,448,,,,,,448 GPUTPCCFChargeMapFiller_fillFromDigits,512,,512,512,,,,,,448,,,,,,448 GPUTPCCFChargeMapFiller_findFragmentStart,512,,512,512,,,,,,448,,,,,,448 diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h index 9913762ae34df..78a43856f00f1 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.h +++ b/GPU/GPUTracking/Global/GPUChainTracking.h @@ -71,6 +71,7 @@ struct CfFragment; class GPUTPCClusterFinder; struct GPUSettingsProcessing; struct GPUSettingsRec; +struct GPUTPCExtraADC; class GPUChainTracking : public GPUChain { @@ -298,13 +299,15 @@ class GPUChainTracking : public GPUChain int32_t RunChainFinalize(); void OutputSanityCheck(); int32_t RunTPCTrackingSectors_internal(); - int32_t RunTPCClusterizer_prepare(bool restorePointers); + int32_t RunTPCClusterizer_prepare(bool restorePointers, const GPUTPCExtraADC& extraADCs); #ifndef GPUCA_RUN2 - std::pair RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane); + std::pair RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane, const GPUTPCExtraADC& extraADCs); void RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int32_t stage, bool doGPU, int32_t lane); std::pair TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment); std::pair TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment& fragment); void TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment& fragment); + void TPCClusterizerTransferExtraADC(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs); + void TPCClusterizerCheckExtraADCZeros(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs); #endif void RunTPCTrackingMerger_MergeBorderTracks(uint8_t mergeMode, GPUReconstruction::krnlDeviceType deviceType); void RunTPCTrackingMerger_Resolve(int8_t useOrigTrackParam, int8_t mergeAll, GPUReconstruction::krnlDeviceType deviceType); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index a2a07be7832ca..4609c9cd59bed 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -17,6 +17,7 @@ #include "GPUChainTrackingDebug.h" #include "GPULogging.h" #include "GPUO2DataTypes.h" +#include "GPUTPCExtraADC.h" #include "GPUMemorySizeScalers.h" #include "GPUTrackingInputProvider.h" #include "GPUNewCalibValues.h" @@ -56,10 +57,13 @@ #include "utils/VcShim.h" #include "utils/strtag.h" -#include +#include "utils/vecpod.h" #include +#include #include +// #define INSERT_SATURATED_SIGNALS + using namespace o2::gpu; using namespace o2::tpc; using namespace o2::tpc::constants; @@ -155,11 +159,171 @@ void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfF } } +void GPUChainTracking::TPCClusterizerTransferExtraADC(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs) +{ + const int32_t iSector = clusterer.mISector; + const auto& fragment = clusterer.mPmemory->fragment; + const auto& digits = extraADCs.digitsBySector[iSector]; + + if (fragment.index != 0) { + return; + } + + if (digits.empty()) { + return; + } + + const size_t chargeMapSize = TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen); + const size_t chargeMapSizeBytes = chargeMapSize * sizeof(PackedCharge); + + vecpod chargeMapHostData; + chargeMapHostData.resize(chargeMapSize); + + CfArray2D chargeMapHost(reinterpret_cast(chargeMapHostData.data())); + + vecpod extraPositions; + extraPositions.reserve(digits.size()); + + GPUMemCpy(RecoStep::TPCClusterFinding, chargeMapHostData.data(), clustererShadow.mPchargeMap, chargeMapSizeBytes, lane, false); + SynchronizeStream(lane); + + for (const auto& d : digits) { + if (!fragment.contains(d.getTimeStamp())) { + continue; + } + + CfChargePos pos{(tpccf::Row)d.getRow(), (tpccf::Pad)d.getPad(), (tpccf::TPCFragmentTime)(d.getTimeStamp() - fragment.start)}; + chargeMapHost[pos] = PackedCharge(d.getChargeFloat()); + + extraPositions.push_back(pos); + } + + GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPchargeMap, chargeMapHostData.data(), chargeMapSizeBytes, lane, true); + + const size_t nPositions = clusterer.mPmemory->counters.nPositions; + const size_t extraPositionsOffset = nPositions - extraPositions.size(); + GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPpositions + extraPositionsOffset, extraPositions.data(), extraPositions.size() * sizeof(CfChargePos), lane, true); +} + +void GPUChainTracking::TPCClusterizerCheckExtraADCZeros(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs) +{ + const int32_t iSector = clusterer.mISector; + const auto& fragment = clusterer.mPmemory->fragment; + const auto& digits = extraADCs.digitsBySector[iSector]; + + if (fragment.index != 0) { + return; + } + + if (digits.empty()) { + return; + } + + const size_t chargeMapSize = TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen); + const size_t chargeMapSizeBytes = chargeMapSize * sizeof(PackedCharge); + + vecpod chargeMapHostData; + chargeMapHostData.resize(chargeMapSize); + + CfArray2D chargeMapHost(reinterpret_cast(chargeMapHostData.data())); + + GPUMemCpy(RecoStep::TPCClusterFinding, chargeMapHostData.data(), clustererShadow.mPchargeMap, chargeMapSizeBytes, lane, false); + SynchronizeStream(lane); + + size_t nNonZeroADCs = 0; + + for (const auto& d : digits) { + if (!fragment.contains(d.getTimeStamp())) { + continue; + } + + CfChargePos pos{(tpccf::Row)d.getRow(), (tpccf::Pad)d.getPad(), (tpccf::TPCFragmentTime)(d.getTimeStamp() - fragment.start)}; + + auto adc = chargeMapHost[pos].unpack(); + + if (adc != 0) { + nNonZeroADCs++; + } + } + + if (nNonZeroADCs > 0) { + GPUInfo("Non Zero ADCs: %zu", nNonZeroADCs); + } else { + GPUInfo("Cleared all extra ADC values!", nNonZeroADCs); + } +} + namespace { struct TPCCFDecodeScanTmp { int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter; }; + +// Additional ADC values must be generated at start of clusterizer +// This is required, so enough memory is allocated for the charge points +// And ADCs can be injected by "simply" +// -> copying chargeMap + chargePositions to host +// -> writing additional adcs to chargeMap + positions +// -> copying values to device +GPUTPCExtraADC GenerateSaturatedSignals(size_t seed = 42) +{ + constexpr int32_t MinTailLength = 50; + constexpr int32_t MaxTailLength = 200; + constexpr int32_t TailWidth = 3; // Assume tails are 3 pads wide at the moment + + constexpr GPUTPCGeometry geo; + + GPUTPCExtraADC adcs; + + const int32_t nHIPs = 50; + const int32_t firstTB = 0; // Place all HIPs in first fragment for now + const int32_t lastTB = 4000 - MaxTailLength; // Don't allow cut off tails at fragment borders + const int32_t tailADC = 250; // charge should decrease over time, but for now just hardcode ADC above the threshold + + std::mt19937 gen{seed}; + std::uniform_int_distribution<> randomRow(0, GPUTPCGeometry::NROWS - 1); + std::uniform_int_distribution<> randomTB(firstTB, lastTB); + std::uniform_int_distribution<> randomTailLength(MinTailLength, MaxTailLength); + // std::normal_distribution<> tailLengthNoise(8, 2.0); + + for (int32_t iHIP = 0; iHIP < nHIPs; iHIP++) { + + const int32_t row = randomRow(gen); + const int32_t nPads = geo.NPads(row); + std::uniform_int_distribution<> randomPad(0, nPads - 1); + + const int32_t basePad = randomPad(gen); + const int32_t baseTb = randomTB(gen); + + auto& digits = adcs.digitsBySector[0]; + + const int32_t tailLength = randomTailLength(gen); + + for (int32_t dPad = -TailWidth; dPad <= TailWidth; dPad++) { + const int32_t iPad = basePad + dPad; + if (iPad < 0 || iPad >= nPads) { + continue; + } + + for (int32_t dTime = 0; dTime < tailLength; dTime++) { + const int32_t iTime = baseTb + dTime; + + if (iTime >= 4000) { + break; + } + + const auto adc = dTime == 0 && dPad == 0 ? 1023 : tailADC; + + digits.emplace_back(0, adc, row, iPad, iTime); + } + } + } + + GPUInfo("Generated %zu ADCs!", adcs.digitsBySector[0].size()); + + return adcs; +} + } // namespace std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment) @@ -437,13 +601,16 @@ void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clust } } -std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane) +std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane, const GPUTPCExtraADC& extraADCs) { bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding; if (mCFContext->abandonTimeframe) { return {0, 0}; } - const auto& retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment); + auto retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment); + if (fragment.index == 0) { + retVal.first += extraADCs.digitsBySector[iSector].size(); + } if (doGPU) { GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; @@ -473,7 +640,7 @@ std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int return retVal; } -int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) +int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers, const GPUTPCExtraADC& extraADCs) { bool doGPU = mRec->GetRecoStepsGPU() & gpudatatypes::RecoStep::TPCClusterFinding; if (restorePointers) { @@ -569,7 +736,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) mCFContext->fragmentFirst = CfFragment{std::max(mCFContext->tpcMaxTimeBin + 1, maxFragmentLen), maxFragmentLen}; for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) { if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) { - mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector); + mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector, extraADCs); } } @@ -595,7 +762,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) mRec->PushNonPersistentMemory(qStr2Tag("TPCCLUST")); const auto& threadContext = GetThreadContext(); const bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding; - if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) { + + GPUTPCExtraADC extraADCs; +#ifdef INSERT_SATURATED_SIGNALS + extraADCs = GenerateSaturatedSignals(); +#endif + + if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer, extraADCs)) { return 1; } if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) { @@ -625,7 +798,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) SetupGPUProcessor(&processors()->tpcClusterer[iSector], true); // Now we allocate } if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) { - RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) + RunTPCClusterizer_prepare(true, extraADCs); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) } if (doGPU && mIOPtrs.tpcZS) { @@ -769,6 +942,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) const bool propagateMCLabels = buildNativeHost && GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC; const bool sortClusters = buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4); + if (GetProcessingSettings().runMC && (!processors()->ioPtrs.tpcPackedDigits || !processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC)) { + GPUWarning("Requested to process MC labels, but no labels present"); + } + auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr; mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor; @@ -938,7 +1115,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) f = mCFContext->fragmentFirst; } if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) { - mCFContext->nextPos[nextSector] = RunTPCClusterizer_transferZS(nextSector, f, GetProcessingSettings().nTPCClustererLanes + lane); + mCFContext->nextPos[nextSector] = RunTPCClusterizer_transferZS(nextSector, f, GetProcessingSettings().nTPCClustererLanes + lane, extraADCs); } } GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; @@ -949,9 +1126,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (!mIOPtrs.tpcZS) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); } - if (DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererDigits, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile)) { - clusterer.DumpChargeMap(*mDebugFile, "Charges"); - } + + TPCClusterizerTransferExtraADC(clusterer, clustererShadow, lane, extraADCs); if (propagateMCLabels) { runKernel({GetGrid(clusterer.mPmemory->counters.nDigitsInFragment, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}); @@ -960,14 +1136,33 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0); checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true); checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter; + // TODO Move hipTailFilter flag to ProcessingSettings? + // TODO Add some warning when re enabling pad filter with this flag, so it's not just silently enabled when disabling was requested + checkForNoisyPads |= rec()->GetParam().rec.tpc.hipTailFilter; + + if (rec()->GetParam().rec.tpc.hipTailFilter && !doGPU) { + GPUError("HIP tail filter enabled, but this is currently not supported on CPU"); + } if (checkForNoisyPads) { + if (rec()->GetParam().rec.tpc.hipTailFilter) { + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPhipTailsByRow, GPUTPCGeometry::NROWS * sizeof(*clustererShadow.mPhipTailsByRow) * GPUTPCCFHIPClusterizer::MaxHIPTailsPerRow); + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPnHIPTails, GPUTPCGeometry::NROWS * sizeof(*clustererShadow.mPnHIPTails)); + } const int32_t nBlocks = GPUTPCCFCheckPadBaseline::GetNBlocks(doGPU); runKernel({GetGridBlk(nBlocks, lane), {iSector}}); getKernelTimer(RecoStep::TPCClusterFinding, iSector, TPC_REAL_PADS_IN_SECTOR * fragment.lengthWithoutOverlap() * sizeof(PackedCharge), false); } + DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererDigits, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile); + // Avoid additional sync when also dumping digits + const bool debugSyncChargeMap = !(GetProcessingSettings().debugMask & GPUChainTrackingDebugFlags::TPCClustererDigits); + // DumpChargeMap should run after noisy pad filter to avoid yet another dump of intermediate data. When chargemap without pad filter is required, disable pad filter instead. + DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, debugSyncChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Charges"); + + TPCClusterizerCheckExtraADCZeros(clusterer, clustererShadow, lane, extraADCs); + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); if (DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererPeaks, clusterer, &GPUTPCClusterFinder::DumpPeaks, *mDebugFile)) { clusterer.DumpPeakMap(*mDebugFile, "Peaks"); @@ -1002,6 +1197,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; + if (clusterer.mPmemory->counters.nPositions == 0) { + return; + } + if (doGPU) { SynchronizeStream(lane); } @@ -1015,11 +1214,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUTPCGeometry::NROWS * sizeof(*clustererShadow.mPclusterInRow)); } - if (clusterer.mPmemory->counters.nClusters == 0) { - return; - } - - if (GetProcessingSettings().nn.applyNNclusterizer) { + const auto nRegularClusters = clusterer.mPmemory->counters.nClusters; + if (nRegularClusters != 0) { + if (GetProcessingSettings().nn.applyNNclusterizer) { #ifdef GPUCA_HAS_ONNX GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane]; GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN; @@ -1144,7 +1341,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if(!nn_settings.nnClusterizerApplyCfDeconvolution) { // If it is already applied don't do it twice, otherwise apply now runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true); } - DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMapSplit, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 if (nn_settings.nnClusterizerVerbosity > 3) { LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; @@ -1155,16 +1352,31 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #endif } else { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true); - DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMapSplit, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0); } - if (doGPU && propagateMCLabels) { - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); - if (doGPU) { + if (doGPU && propagateMCLabels) { + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); + if (doGPU) { + SynchronizeStream(lane); + } + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); // Computes MC labels + } + } + + // TODO: Move this right after CheckPadBaseline once tail zeroing is moved into this kernel. + if (rec()->GetParam().rec.tpc.hipTailFilter) { + runKernel({GetGridBlk(GPUTPCGeometry::NROWS, lane), {iSector}}); + runKernel({GetGridBlk(GPUTPCGeometry::NROWS, lane), {iSector}}); + if (doGPU && (nRegularClusters == 0 || GetProcessingSettings().debugLevel >= 3)) { + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); SynchronizeStream(lane); } - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); // Computes MC labels + } + + if (clusterer.mPmemory->counters.nClusters == 0) { + return; } if (GetProcessingSettings().debugLevel >= 3) { @@ -1173,8 +1385,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clusterer, lane); laneHasData[lane] = true; - // Include clusters in default debug mask, exclude other debug output by default - DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererClusters, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); // clang-format off }); mRec->SetNActiveThreadsOuterLoop(1); } @@ -1192,6 +1402,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (laneHasData[lane]) { anyLaneHasData = true; + // Include clusters in default debug mask, exclude other debug output by default. + // The cluster buffers are accumulated per sector, so dump them once after all fragments. + DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererClusters, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); // clang-format off if (buildNativeGPU && GetProcessingSettings().tpccfGatherKernel) { runKernel({GetGridBlk(GPUTPCGeometry::NROWS, mRec->NStreams() - 1), {iSector}}, &mInputsShadow->mPclusterNativeBuffer[nClsTotal]); } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index dda15d403407e..6ca50f24351c4 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -16,6 +16,7 @@ #include "GPUChainTrackingDebug.h" #include "GPULogging.h" #include "GPUO2DataTypes.h" +#include "GPUTPCExtraADC.h" #include "GPUTrackingInputProvider.h" #include "GPUTPCCFChainContext.h" #include "TPCClusterDecompressor.h" @@ -63,7 +64,7 @@ int32_t GPUChainTracking::RunTPCCompression() if (mPipelineFinalizationCtx && GetProcessingSettings().doublePipelineClusterizer) { SynchronizeEventAndRelease(mEvents->single); auto* foreignChain = (GPUChainTracking*)GetNextChainInQueue(); - foreignChain->RunTPCClusterizer_prepare(false); + foreignChain->RunTPCClusterizer_prepare(false, {}); foreignChain->mCFContext->ptrClusterNativeSave = processorsShadow()->ioPtrs.clustersNative; } #endif diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h index a0be9d833d5a9..39e48f9f14d9a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h +++ b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h @@ -45,7 +45,8 @@ enum GPUChainTrackingDebugFlags : uint32_t { TPCClustererPeaks = 1 << 19, TPCClustererSuppressedPeaks = 1 << 20, TPCClustererChargeMap = 1 << 21, - TPCClustererZeroedCharges = 1 << 22 + TPCClustererChargeMapSplit = 1 << 22, + TPCClustererZeroedCharges = 1 << 23 }; template diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx index 204d9d6a8b81a..3c05284d3b876 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx @@ -16,14 +16,187 @@ #include "CfArray2D.h" #include "PackedCharge.h" #include "clusterFinderDefs.h" +#include "DataFormatsTPC/ClusterNative.h" #ifndef GPUCA_GPUCODE #include "utils/VcShim.h" #endif +#if 0 +#define DPRINT(...) printf(__VA_ARGS__) +#define DPRINTB(...) \ + if (iThread == 0) \ + printf(__VA_ARGS__) +#define DPRINTB_IF(test, ...) \ + if (iThread == 0 && (test)) \ + printf(__VA_ARGS__) +#else +#define DPRINT(...) ((void)0) +#define DPRINTB(...) ((void)0) +#define DPRINTB_IF(test, ...) ((void)0) +#endif + using namespace o2::gpu; using namespace o2::gpu::tpccf; +using Kernel = GPUTPCCFCheckPadBaseline; + +static GPUdi() HIPTailDescriptor* GetHIPTails(GPUTPCClusterFinder& clusterer, int32_t row) +{ + // HIP TAILS: indexing starts at 1, so 0 index indicates no connection + return clusterer.mPhipTailsByRow + row * GPUTPCCFHIPClusterizer::MaxHIPTailsPerRow; +} + +static GPUdi() Charge UpdateHIPTailFilter(Charge filteredCharge, Charge charge, Charge alpha) +{ + return filteredCharge + alpha * (charge - filteredCharge); +} + +static GPUdi() float HIPTailTimeMean(const HIPTailDescriptor& tail) +{ + const float length = tail.tailEnd > tail.tailStart ? float(tail.tailEnd - tail.tailStart) : 1.f; + return tail.tailStart + 0.5f * (length - 1.f); +} + +static GPUdi() float HIPTailTimeVariance(const HIPTailDescriptor& tail) +{ + const float length = tail.tailEnd > tail.tailStart ? float(tail.tailEnd - tail.tailStart) : 1.f; + return (length * length - 1.f) * (1.f / 12.f); +} + +// Collect tails marked for closing across the workgroup using a prefix scan, +// then cooperatively zero the charge map entries for each closed tail. +// Caller must set acc.activeHIPTail.end before calling if the tail is open. +static GPUdi() uint16_t CloseHIPTails( + Kernel::GPUSharedMemory& smem, + GPUTPCClusterFinder& clusterer, + int32_t iThread, int32_t nThreads, + int16_t iPadHandle, + CfChargePos basePos, + CfArray2D& chargeMap, + Kernel::PadChargeAccu& acc, + bool shouldCloseTail) +{ + const uint32_t row = basePos.row(); + const uint16_t nClosedTails = work_group_count(shouldCloseTail); + + auto* nHIPTails = clusterer.mPnHIPTails; + auto* hipTails = GetHIPTails(clusterer, row); + + if (nClosedTails > 0) { + int16_t iClosedTail = work_group_scan_inclusive_add((int16_t)shouldCloseTail) - 1; + const bool shouldStoreTail = shouldCloseTail && acc.activeHIPTail.Length() > 0; + uint16_t nStoredTails = work_group_count(shouldStoreTail); + int16_t iStoredTail = work_group_scan_inclusive_add((int16_t)shouldStoreTail) - 1; + + // Use exactly one atomic add per closing call to reduce differences in + // tail ordering between runs. + if (nStoredTails > 0) { + if (iThread == 0) { + smem.tailStoreBase = CAMath::AtomicAdd(&nHIPTails[row], (uint32_t)nStoredTails); + } + GPUbarrier(); + } + if (shouldCloseTail) { + smem.tailsClosedPad[iClosedTail] = iPadHandle; + smem.tailsClosed[iClosedTail] = acc.activeHIPTail; + smem.tailsClosedStoreIdx[iClosedTail] = GPUTPCCFHIPTailConnector::MaxHIPTailsPerRow; + + if (shouldStoreTail) { + const uint32_t idx = smem.tailStoreBase + iStoredTail + 1; + smem.tailsClosedStoreIdx[iClosedTail] = idx; + if (idx < GPUTPCCFHIPTailConnector::MaxHIPTailsPerRow) { + hipTails[idx] = {0, 0, (uint16_t)iPadHandle, + (uint16_t)acc.activeHIPTail.start, (uint16_t)acc.activeHIPTail.end, + 0.f, 0.f}; + } + } + + acc.tailFilterCharge = 0; + acc.activeHIPTail.Reset(); + } + + GPUbarrier(); + } + + // TODO: performance improvement -> parallelize this loop across tails + for (uint16_t iTail = 0; iTail < nClosedTails; iTail++) { + const auto tailPad = smem.tailsClosedPad[iTail]; + const auto tail = smem.tailsClosed[iTail]; + const uint32_t tailStoreIdx = smem.tailsClosedStoreIdx[iTail]; + + Charge qTot = 0.f; + Charge qMax = 0.f; + for (uint16_t iTime = iThread; iTime < tail.Length(); iTime += nThreads) { + const int16_t time = tail.start + iTime; + auto pos = basePos.delta({tailPad, time}); + const Charge q = chargeMap[pos].unpack(); + qTot += q; + qMax = CAMath::Max(qMax, q); + chargeMap[pos] = PackedCharge{0}; + } + + smem.tailQTotScratch[iThread] = qTot; + smem.tailQMaxScratch[iThread] = qMax; + GPUbarrier(); + for (uint16_t active = nThreads; active > 1;) { + const uint16_t stride = (active + 1) / 2; + if (iThread < active - stride) { + smem.tailQTotScratch[iThread] += smem.tailQTotScratch[iThread + stride]; + smem.tailQMaxScratch[iThread] = CAMath::Max(smem.tailQMaxScratch[iThread], smem.tailQMaxScratch[iThread + stride]); + } + active = stride; + GPUbarrier(); + } + + if (iThread == 0 && tailStoreIdx < GPUTPCCFHIPTailConnector::MaxHIPTailsPerRow) { + HIPTailDescriptor& tailDescriptor = hipTails[tailStoreIdx]; + tailDescriptor.qTot = smem.tailQTotScratch[0]; + tailDescriptor.qMax = smem.tailQMaxScratch[0]; + } + } + + return nClosedTails; +} + +template +static GPUdi() void ScanCachedCharges(Kernel::GPUSharedMemory& smem, uint16_t timeOffset, uint16_t pad, Charge hipTailThreshold, Charge hipTailFilterAlpha, Kernel::PadChargeAccu& acc) +{ + for (int32_t i = 0; i < Kernel::NumOfCachedTBs; i++) { + const Charge qs = smem.charges[i][pad]; + const int16_t curTB = timeOffset + i; + + acc.totalCharges += qs > 0; + acc.consecCharges = qs > 0 ? acc.consecCharges + 1 : 0; + acc.maxConsecCharges = CAMath::Max(acc.consecCharges, acc.maxConsecCharges); + acc.maxCharge = CAMath::Max(qs, acc.maxCharge); + + if (qs >= hipTailThreshold) { + if (acc.aboveThresholdStart < 0) { + acc.aboveThresholdStart = curTB; + } + } else { + acc.aboveThresholdStart = -1; + } + + if constexpr (CheckHIPTrigger) { + if (acc.HIPtb < 0 && qs >= Charge(Kernel::MaxADC)) { + acc.HIPtb = acc.aboveThresholdStart; // start of rising edge, not first sat TB + smem.tails[pad] = {acc.HIPtb, 0}; // Broadcast HIP start TB to neighboring pads / threads + } + } + + if constexpr (CheckHIPTailEnd) { + if (acc.activeHIPTail.IsOpen()) { + acc.tailFilterCharge = UpdateHIPTailFilter(acc.tailFilterCharge, qs, hipTailFilterAlpha); + if (acc.tailFilterCharge < hipTailThreshold) { + acc.activeHIPTail.end = curTB; + } + } + } + } +} + template <> GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { @@ -49,6 +222,9 @@ GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t } const CfFragment& fragment = clusterer.mPmemory->fragment; + const bool hipFilterOn = clusterer.Param().rec.tpc.hipTailFilter; + const Charge hipTailThreshold = clusterer.Param().rec.tpc.hipTailFilterThreshold; + const Charge hipTailFilterAlpha = clusterer.Param().rec.tpc.hipTailFilterAlpha; CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); constexpr GPUTPCGeometry geo; @@ -57,45 +233,133 @@ GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t const auto nPads = geo.NPads(iRow); const CfChargePos basePos{(Row)iRow, 0, 0}; - int32_t totalCharges = 0; - int32_t consecCharges = 0; - int32_t maxConsecCharges = 0; - Charge maxCharge = 0; + PadChargeAccu acc; const int16_t iPadOffset = iThread % MaxNPadsPerRow; const int16_t iTimeOffset = iThread / MaxNPadsPerRow; const int16_t iPadHandle = iThread; const bool handlePad = iPadHandle < nPads; - const auto firstTB = fragment.firstNonOverlapTimeBin(); - const auto lastTB = fragment.lastNonOverlapTimeBin(); + if (iPadHandle < MaxNPadsPerRow) { + smem.tails[iPadHandle] = {-1, -1}; + } + GPUbarrier(); - for (auto t = firstTB; t < lastTB; t += NumOfCachedTBs) { + // Pad filter scans the entire fragments including overlap. + // Minimal runtime overhead and prevents headaches later on as + // saturated signal in overlap region can create tails in the next fragment + // even when cleared in current fragment as they're decoded twice + const TPCFragmentTime firstTB = 0; + const TPCFragmentTime lastTB = fragment.length; - const TPCFragmentTime iTime = t + iTimeOffset; + for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs) { - const CfChargePos pos = basePos.delta({iPadOffset, iTime}); + bool thisThreadHasTrigger = false; + for (uint16_t tt = 0; tt < NumOfCachedTBs; tt += TimebinsPerCacheline) { + const TPCFragmentTime iTimeLoad = t + tt + iTimeOffset; - smem.charges[iTimeOffset][iPadOffset] = iTime < lastTB && iPadOffset < nPads ? chargeMap[pos].unpack() : 0; + const CfChargePos pos = basePos.delta({iPadOffset, iTimeLoad}); - GPUbarrier(); + const Charge ql = iTimeLoad < lastTB && iPadOffset < nPads ? chargeMap[pos].unpack() : 0; + smem.charges[tt + iTimeOffset][iPadOffset] = ql; + + thisThreadHasTrigger |= ql >= Charge(MaxADC); + } + + const bool hasHIPTrigger = hipFilterOn && work_group_any(thisThreadHasTrigger); + + acc.HIPtb = -1; if (handlePad) { - for (int32_t i = 0; i < NumOfCachedTBs; i++) { - const Charge q = smem.charges[i][iPadHandle]; - totalCharges += (q > 0); - consecCharges = (q > 0) ? consecCharges + 1 : 0; - maxConsecCharges = CAMath::Max(consecCharges, maxConsecCharges); - maxCharge = CAMath::Max(q, maxCharge); + + // TODO: is this really necessary? + // Why is the old version so much slower, when we just add short branches to the loop??? + if (!hasHIPTrigger) [[likely]] { + if (!acc.activeHIPTail.IsOpen()) { + ScanCachedCharges(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc); + } else { + ScanCachedCharges(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc); + } + } else { + if (!acc.activeHIPTail.IsOpen()) { + ScanCachedCharges(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc); + } else { + ScanCachedCharges(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc); + } } } GPUbarrier(); - } + + if (hasHIPTrigger) [[unlikely]] { + + DPRINTB("%d: Trigger!\n", iBlock); + + if (handlePad && acc.HIPtb < 0) { + + // Search neighboring pads for trigger + for (int16_t i = -SSClusterPadWidth; i < 0; i++) { + const auto p = iPadHandle + i; + if (p > -1) { + acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb); + } + } + + for (int16_t i = 1; i <= SSClusterPadWidth; i++) { + const auto p = iPadHandle + i; + if (p < MaxNPadsPerRow) { + acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb); + } + } + } + + bool shouldCloseTail = acc.HIPtb > -1 && acc.activeHIPTail.HasValue(); + if (shouldCloseTail && acc.activeHIPTail.IsOpen()) { + DPRINT("%d: end = %d\n", iThread, acc.HIPtb); + acc.activeHIPTail.end = acc.HIPtb; + } + + CloseHIPTails(smem, clusterer, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail); + + GPUbarrier(); + + if (acc.HIPtb > -1) { + DPRINT("%d: start = %d\n", iThread, acc.HIPtb); + acc.activeHIPTail.SetOpen(acc.HIPtb); + acc.tailFilterCharge = Charge(MaxADC); + } + + // Clear smem between iterations to prevent stale entries + if (handlePad) { + smem.tails[iPadHandle].Reset(); + } + + GPUbarrier(); + + } // if (hipTriggerFound) + + } // for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs) if (handlePad) { - updatePadBaseline(basePos.gpad + iPadHandle, clusterer, totalCharges, maxConsecCharges, maxCharge); + updatePadBaseline(basePos.gpad + iPadHandle, clusterer, acc.totalCharges, acc.maxConsecCharges, acc.maxCharge); } + + // --- Close remaining tails + const bool shouldCloseTail = acc.activeHIPTail.HasValue(); + + // Call `work_group_any` here, instead of always counting. + // This is important as `work_group_count` is a lot slower + // and has a lot of overhead if no HIPs were found. + if (work_group_any(shouldCloseTail)) { + if (shouldCloseTail && acc.activeHIPTail.IsOpen()) { + acc.activeHIPTail.end = lastTB; + } + + [[maybe_unused]] const uint16_t nClosedTails = CloseHIPTails(smem, clusterer, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail); + + DPRINTB_IF(nClosedTails > 0, "%d: Close remaining tails (%d)\n", iBlock, nClosedTails); + } + #endif } @@ -172,3 +436,131 @@ GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int32_t pad, const GPUTP clusterer.mPpadIsNoisy[pad] = true; } } + +// ======== HIP Tail Connector Kernel ======== + +template <> +GPUd() void GPUTPCCFHIPTailConnector::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) +{ + if (iBlock >= (int32_t)GPUTPCGeometry::NROWS) { + return; + } + const uint32_t row = iBlock; + + const uint32_t nTails = CAMath::Min(clusterer.mPnHIPTails[row], (uint32_t)MaxHIPTailsPerRow - 1); + + // HIP TAILS: indexing starts at 1, so 0 index indicates no connection + HIPTailDescriptor* tails = GetHIPTails(clusterer, row); + + for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads) { + auto* tail = &tails[iTail]; + + // TODO: this is needed because tailStarts may vary due to rising edge + // Better approach would be to also track the triggered timebin and match that instead + uint16_t overlapWindowStart = tail->tailStart >= 5 ? tail->tailStart - 5 : 0; + uint16_t overlapWindowEnd = tail->tailStart + 5; + + for (uint32_t jTail = iTail + 1; jTail <= nTails; jTail++) { + auto* tailNext = &tails[jTail]; + if (tailNext->iPrev > 0) { + continue; + } + + const bool overlapPad = tailNext->pad >= tail->pad - GPUTPCCFCheckPadBaseline::SSClusterPadWidth && tailNext->pad <= tail->pad + GPUTPCCFCheckPadBaseline::SSClusterPadWidth; + const bool overlapTime = tailNext->tailStart >= overlapWindowStart && tailNext->tailStart < overlapWindowEnd; + + if (overlapPad && overlapTime) { + if (CAMath::AtomicCAS(&tailNext->iPrev, 0u, iTail)) { + tail->iNext = jTail; + break; + } + } + } + } +} + +// ======== HIP Clusterizer Kernel ======== + +template <> +GPUd() void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) +{ + if (iBlock >= (int32_t)GPUTPCGeometry::NROWS) { + return; + } + + const uint32_t row = iBlock; + uint32_t nTails = clusterer.mPnHIPTails[row]; + nTails = CAMath::Min(nTails, (uint32_t)MaxHIPTailsPerRow - 1); + + HIPTailDescriptor* tails = GetHIPTails(clusterer, row); + const auto& fragment = clusterer.mPmemory->fragment; + + tpccf::SizeT nCreatedClusters = 0; + for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads) { + + auto* tail = &tails[iTail]; + + if (tail->iPrev != 0) { + continue; + } + + float qTot = tail->qTot; + float qMax = tail->qMax; + const float firstWeight = tail->qTot; + const float firstPad = tail->pad; + const float firstTime = HIPTailTimeMean(*tail); + const float firstTimeVariance = HIPTailTimeVariance(*tail); + float padSum = firstWeight * firstPad; + float padSqSum = firstWeight * firstPad * firstPad; + float timeSum = firstWeight * firstTime; + float timeSqSum = firstWeight * (firstTime * firstTime + firstTimeVariance); + + while (tail->iNext != 0) { + + tail = &tails[tail->iNext]; + + const float tailWeight = tail->qTot; + const float tailPad = tail->pad; + const float tailTime = HIPTailTimeMean(*tail); + const float tailTimeVariance = HIPTailTimeVariance(*tail); + qMax = CAMath::Max(qMax, tail->qMax); + qTot += tail->qTot; + padSum += tailWeight * tailPad; + padSqSum += tailWeight * tailPad * tailPad; + timeSum += tailWeight * tailTime; + timeSqSum += tailWeight * (tailTime * tailTime + tailTimeVariance); + } + + const float weightSum = CAMath::Max(qTot, 1.f); + float padMean = padSum / weightSum; + float timeMean = timeSum / weightSum; // TODO: Use timebin of saturated signal instead! Time mean is biased for long tails. + float padSigma = CAMath::Sqrt(CAMath::Max(0.f, padSqSum / weightSum - padMean * padMean)); + float timeSigma = CAMath::Sqrt(CAMath::Max(0.f, timeSqSum / weightSum - timeMean * timeMean)); + + o2::tpc::ClusterNative cn; + cn.qMax = qMax; + cn.qTot = (uint16_t)CAMath::Min(qTot, 65535.f); + float clusterTime = fragment.start + timeMean - clusterer.Param().rec.tpc.clustersShiftTimebinsClusterizer; + cn.setTimeFlags(clusterTime, 0); + cn.setPad(padMean); + cn.setSigmaTime(timeSigma); + cn.setSigmaPad(padSigma); + + if (cn.qMax >= 1023) { + // Cut off clusters where the tail connection failed for some reason + // TODO: Deduplicate with GPUTPCCFClusterizer::sortIntoBuckets (can't call cross-kernel). + // TODO: Add error reporting for row cluster overflow. + uint32_t index = CAMath::AtomicAdd(&clusterer.mPclusterInRow[row], 1u); + if (index < clusterer.mNMaxClusterPerRow) { + clusterer.mPclusterByRow[clusterer.mNMaxClusterPerRow * row + index] = cn; + nCreatedClusters++; + } + } + } + +#if defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__)) + CAMath::AtomicAdd(reinterpret_cast(&clusterer.mPmemory->counters.nClusters), (unsigned long long)nCreatedClusters); +#else + CAMath::AtomicAdd(&clusterer.mPmemory->counters.nClusters, nCreatedClusters); +#endif +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h index 7638b95ee7f0b..f78f91a548ac9 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h @@ -15,6 +15,11 @@ /// Kernel identifies noisy TPC pads by analyzing charge patterns over time. /// A pad is marked noisy if it exceeds thresholds for total or consecutive /// time bins with charge, unless the charge exceeds a saturation threshold. +/// +/// Optionally detects Highly Ionising Particle (HIP) tails: when a saturated +/// ADC value (1023) is found, the tail region on the triggering pad and its +/// neighbors is zeroed in the charge map until an exponential charge filter +/// drops below a configurable threshold. #ifndef O2_GPU_GPU_TPC_CF_CHECK_PAD_BASELINE_H #define O2_GPU_GPU_TPC_CF_CHECK_PAD_BASELINE_H @@ -29,6 +34,16 @@ namespace o2::gpu { +struct HIPTailDescriptor { + uint32_t iPrev; + uint32_t iNext; + uint16_t pad; + uint16_t tailStart; + uint16_t tailEnd; + float qTot; + float qMax; +}; + class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate { @@ -39,15 +54,65 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate EntriesPerCacheline = PadsPerCacheline * TimebinsPerCacheline, NumOfCachedPads = GPUCA_WARP_SIZE / TimebinsPerCacheline, NumCLsPerWarp = GPUCA_WARP_SIZE / EntriesPerCacheline, - NumOfCachedTBs = TimebinsPerCacheline, + NumOfCachedTBs = TimebinsPerCacheline * 8, // Threads index shared memory as [iThread / MaxNPadsPerRow][iThread % MaxNPadsPerRow]. // Rounding up to a multiple of PadsPerCacheline ensures iThread / MaxNPadsPerRow < NumOfCachedTBs // for all threads, avoiding out-of-bounds access. MaxNPadsPerRow = CAMath::nextMultipleOf(GPUTPCGeometry::MaxNPadsPerRow()), + + MaxADC = 1023, + + NThreads = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline), + SSClusterPadWidth = 5, }; - struct GPUSharedMemory { + union HipTailRange { + struct { + int16_t start; + int16_t end; + }; + + // Be careful with using default initialized values. + // Need default constructor, so can be placed in shared memory. + // Might be zero initialized, but invalid tail needs start = end = -1 instead. + GPUdDefault() HipTailRange() = default; + GPUdi() HipTailRange(int16_t st, int16_t e) : start(st), end(e) {} + + GPUdi() bool HasValue() const { return start > -1; } + GPUdi() bool IsOpen() const { return start > -1 && end < 0; } + + GPUdi() void SetOpen(int16_t st) + { + start = st; + end = -1; + } + + GPUdi() int16_t Length() const { return end - start; } + + GPUdi() void Reset() { start = end = -1; } + }; + + struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64 { tpccf::Charge charges[NumOfCachedTBs][MaxNPadsPerRow]; + HipTailRange tails[MaxNPadsPerRow]; + uint8_t tailsClosedPad[MaxNPadsPerRow]; + HipTailRange tailsClosed[MaxNPadsPerRow]; + uint32_t tailsClosedStoreIdx[MaxNPadsPerRow]; + tpccf::Charge tailQTotScratch[NThreads]; + tpccf::Charge tailQMaxScratch[NThreads]; + uint32_t tailStoreBase; + }; + + // Accumulated values from scanning cached charges in a pad + struct PadChargeAccu { + int32_t totalCharges = 0; + int32_t consecCharges = 0; + int32_t maxConsecCharges = 0; + tpccf::Charge maxCharge = 0; + int16_t HIPtb = -1; + int16_t aboveThresholdStart = -1; // first TB of current above-hipTailThreshold streak; used to extend the tail back over the rising edge before saturation + HipTailRange activeHIPTail{-1, -1}; + tpccf::Charge tailFilterCharge = 0; }; typedef GPUTPCClusterFinder processorType; @@ -79,6 +144,58 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate GPUd() static void updatePadBaseline(int32_t pad, const GPUTPCClusterFinder&, int32_t totalCharges, int32_t consecCharges, tpccf::Charge maxCharge); }; +class GPUTPCCFHIPTailConnector : public GPUKernelTemplate +{ + public: + enum { + MaxHIPTails = 1 << 15, + MaxHIPTailsPerRow = MaxHIPTails, + }; + + struct GPUSharedMemory { + }; + + typedef GPUTPCClusterFinder processorType; + GPUhdi() static processorType* Processor(GPUConstantMem& processors) + { + return processors.tpcClusterer; + } + + GPUhdi() constexpr static gpudatatypes::RecoStep GetRecoStep() + { + return gpudatatypes::RecoStep::TPCClusterFinding; + } + + template + GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer); +}; + +class GPUTPCCFHIPClusterizer : public GPUKernelTemplate +{ + public: + enum { + MaxHIPTails = GPUTPCCFHIPTailConnector::MaxHIPTails, + MaxHIPTailsPerRow = GPUTPCCFHIPTailConnector::MaxHIPTailsPerRow, + }; + + struct GPUSharedMemory { + }; + + typedef GPUTPCClusterFinder processorType; + GPUhdi() static processorType* Processor(GPUConstantMem& processors) + { + return processors.tpcClusterer; + } + + GPUhdi() constexpr static gpudatatypes::RecoStep GetRecoStep() + { + return gpudatatypes::RecoStep::TPCClusterFinding; + } + + template + GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer); +}; + } // namespace o2::gpu #endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx index 7fef277138632..3d1ebbd54490e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx @@ -477,7 +477,10 @@ GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, floa CfChargePos pos(padAndRow.getRow(), padAndRow.getPad(), localTime); positions[positionOffset] = pos; - charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad()); + // Only apply gain correction if ADC not fully saturated + if (charge < 1023.f) { + charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad()); + } chargeMap[pos] = PackedCharge(charge); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h index 3ad463f469cd6..74b76f6bf7598 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h @@ -132,7 +132,7 @@ class GPUTPCCFDecodeZSLink : public GPUTPCCFDecodeZSLinkBase { public: // constants for decoding - static inline constexpr int32_t DECODE_BITS = o2::tpc::TPCZSHDRV2::TPC_ZS_NBITS_V34; + static inline constexpr int32_t DECODE_BITS = tpc::TPCZSHDRV2::TPC_ZS_NBITS_V34; static inline constexpr float DECODE_BITS_FACTOR = 1.f / (1 << (DECODE_BITS - 10)); static inline constexpr uint32_t DECODE_MASK = (1 << DECODE_BITS) - 1; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx index e34163d3803fe..67be936ab4627 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx @@ -25,6 +25,7 @@ #include "CfChargePos.h" #include "CfArray2D.h" +#include "GPUTPCCFCheckPadBaseline.h" using namespace o2::gpu; using namespace o2::tpc; @@ -95,6 +96,10 @@ void* GPUTPCClusterFinder::SetPointersScratch(void* mem) if ((mRec->GetRecoStepsGPU() & gpudatatypes::RecoStep::TPCClusterFinding)) { computePointerWithAlignment(mem, mPscanBuf, mBufSize * mNBufs); } + // TODO: Use memory scalers for MaxHIPTails. + // NOTE: Always allocate since Param() is not available during size computation. + computePointerWithAlignment(mem, mPhipTailsByRow, GPUTPCGeometry::NROWS * GPUTPCCFHIPClusterizer::MaxHIPTailsPerRow); + computePointerWithAlignment(mem, mPnHIPTails, GPUTPCGeometry::NROWS); return mem; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 4d036c2056cc5..bc49d225133fa 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -46,6 +46,7 @@ namespace o2::gpu { struct GPUTPCClusterMCInterimArray; struct TPCPadGainCalib; +struct HIPTailDescriptor; struct CfChargePos; @@ -113,6 +114,8 @@ class GPUTPCClusterFinder : public GPUProcessor tpc::ClusterNative* mPclusterByRow = nullptr; GPUTPCClusterMCInterimArray* mPlabelsByRow = nullptr; int32_t* mPscanBuf = nullptr; + HIPTailDescriptor* mPhipTailsByRow = nullptr; + uint32_t* mPnHIPTails = nullptr; // one counter per row Memory* mPmemory = nullptr; GPUdi() int32_t* GetScanBuffer(int32_t iBuf) const { return mPscanBuf + iBuf * mBufSize; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx index 1e5030956df01..09440825681f4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx @@ -15,7 +15,6 @@ #include "GPUTPCClusterFinder.h" #include "GPUReconstruction.h" #include "CfArray2D.h" -#include "DataFormatsTPC/Digit.h" #include "DataFormatsTPC/ClusterNative.h" #include "GPUSettings.h" @@ -155,7 +154,7 @@ void GPUTPCClusterFinder::DumpSuppressedPeaksCompacted(std::ostream& out) void GPUTPCClusterFinder::DumpClusters(std::ostream& out) { - out << "\nClusterer - Clusters - Sector " << mISector << " - Fragment " << mPmemory->fragment.index << "\n"; + out << "\nClusterer - Clusters - Sector " << mISector << " - All Fragments\n"; for (uint32_t i = 0; i < GPUTPCGeometry::NROWS; i++) { size_t N = mPclusterInRow[i]; diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 2176ea2dc3804..3041c2b869de2 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -103,6 +103,8 @@ o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, sortPerSectorRow" "GPUTP o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, countFilteredClusters" "GPUTPCDecompressionKernels" LB) o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, storeFilteredClusters" "GPUTPCDecompressionKernels" LB) o2_gpu_add_kernel("GPUTPCCFCheckPadBaseline" "= TPCCLUSTERFINDER" LB) +o2_gpu_add_kernel("GPUTPCCFHIPTailConnector" "GPUTPCCFCheckPadBaseline TPCCLUSTERFINDER" LB) +o2_gpu_add_kernel("GPUTPCCFHIPClusterizer" "GPUTPCCFCheckPadBaseline TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillIndexMap" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillFromDigits" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, findFragmentStart" "= TPCCLUSTERFINDER" LB int8_t setPositions) diff --git a/GPU/GPUTracking/utils/VcShim.h b/GPU/GPUTracking/utils/VcShim.h index 21a9a6a5c95c2..2bbc1d471bbbb 100644 --- a/GPU/GPUTracking/utils/VcShim.h +++ b/GPU/GPUTracking/utils/VcShim.h @@ -19,7 +19,7 @@ #ifndef GPUCA_NO_VC -#include +#include // IWYU pragma: export #else diff --git a/log.txt b/log.txt deleted file mode 100644 index e69de29bb2d1d..0000000000000