diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h
index db57e7ec06d4b..be88973561e0a 100644
--- a/GPU/Common/GPUCommonAlgorithm.h
+++ b/GPU/Common/GPUCommonAlgorithm.h
@@ -354,6 +354,11 @@ GPUdi() uint8_t warp_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)
 #define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v)
 #define warp_broadcast(v, i) warp_broadcast_FUNC(v, i)
 
+[[nodiscard]] GPUdi() int32_t work_group_count(bool pred)
+{
+  return work_group_reduce_add((int32_t)pred);
+}
+
 #elif (defined(__CUDACC__) || defined(__HIPCC__))
 // CUDA and HIP work the same way using cub, need just different header
 
@@ -416,6 +421,16 @@ GPUdi() T warp_broadcast_FUNC(T v, int32_t i)
 #endif
 }
 
+[[nodiscard]] GPUdi() bool work_group_any(bool pred)
+{
+  return __syncthreads_or(pred);
+}
+
+[[nodiscard]] GPUdi() uint32_t work_group_count(bool pred)
+{
+  return __syncthreads_count(pred);
+}
+
 #else
 // Trivial implementation for the CPU
 
@@ -449,6 +464,16 @@ GPUdi() T warp_broadcast(T v, int32_t i)
   return v;
 }
 
+[[nodiscard]] GPUdi() bool work_group_any(bool pred)
+{
+  return pred;
+}
+
+[[nodiscard]] GPUdi() uint32_t work_group_count(bool pred)
+{
+  return pred;
+}
+
 #endif
 
 #ifdef GPUCA_ALGORITHM_STD
diff --git a/GPU/GPUTracking/DataTypes/GPUTPCExtraADC.h b/GPU/GPUTracking/DataTypes/GPUTPCExtraADC.h
new file mode 100644
index 0000000000000..64fc5da833169
--- /dev/null
+++ b/GPU/GPUTracking/DataTypes/GPUTPCExtraADC.h
@@ -0,0 +1,25 @@
+// Copyright 2019-2026 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCExtraADC.h
+/// \author Felix Weiglhofer
+
+#include "GPUDefConstantsAndSettings.h"
+#include "DataFormatsTPC/Digit.h"
+#include <array>
+#include <vector>
+
+namespace o2::gpu
+{
+struct GPUTPCExtraADC {
+  std::array<std::vector<tpc::Digit>, tpc::constants::MAXSECTOR> digitsBySector;
+};
+} // namespace o2::gpu
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 43a5f4f79abdc..eb7d67a913ceb 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -112,6 +112,9 @@ AddOptionRTC(trackletMinSharedNormFactor, float, 0.f, "", 0, "Max shared defined
 AddOptionRTC(maxTimeBinAboveThresholdIn1000Bin, uint16_t, 500, "", 0, "Except pad from cluster finding if total number of charges in a fragment is above this baseline (disable = 0)")
 AddOptionRTC(maxConsecTimeBinAboveThreshold, uint16_t, 200, "", 0, "Except pad from cluster finding if number of consecutive charges in a fragment is above this baseline (disable = 0)")
 AddOptionRTC(noisyPadSaturationThreshold, uint16_t, 700, "", 0, "Threshold where a timebin is considered saturated, disabling the noisy pad check for that pad")
+AddOptionRTC(hipTailFilter, uint8_t, 0, "", 0, "Enable Highly Ionising Particle tail filter in CheckPadBaseline (0 = disable, 1 = filter tails)")
+AddOptionRTC(hipTailFilterThreshold, uint16_t, 100, "", 0, "Threshold that must be exceeded for a timebin to be counted towards Highly Ionising Particle tail")
+AddOptionRTC(hipTailFilterAlpha, float, 0.5f, "", 0, "Smoothing factor for the exponential Highly Ionising Particle tail filter")
 AddOptionRTC(occupancyMapTimeBins, uint16_t, 16, "", 0, "Number of timebins per histogram bin of occupancy map (0 = disable occupancy map)")
 AddOptionRTC(occupancyMapTimeBinsAverage, uint16_t, 0, "", 0, "Number of timebins +/- to use for the averaging")
 AddOptionRTC(trackFitCovLimit, uint16_t, 1000, "", 0, "Abort fit when y/z cov exceed the limit")
diff --git a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv
index ef215ba5ca870..823a70b24565b 100644
--- a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv
+++ b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv
@@ -60,6 +60,8 @@ GPUTPCGMO2Output_output,256,,,,,,,,,,,,,,,256
 GPUTPCStartHitsFinder,256,,"[1024, 2]","[1024, 7]",256,256,256,256,256,512,512,512,,,,608
 GPUTPCStartHitsSorter,256,,"[1024, 5]","[512, 7]",256,256,256,256,256,"[512, 1]","[512, 1]","[512, 1]",,,,608
 GPUTPCCFCheckPadBaseline,576,,"[576, 2]","[576, 2]",,,,,,"[576, 2]",,,,,,"[576, 2]"
+GPUTPCCFHIPTailConnector,256,,256,256,,,,,,256,
+GPUTPCCFHIPClusterizer,256,,256,256,,,,,,256,
 GPUTPCCFChargeMapFiller_fillIndexMap,512,,512,512,,,,,,448,,,,,,448
 GPUTPCCFChargeMapFiller_fillFromDigits,512,,512,512,,,,,,448,,,,,,448
 GPUTPCCFChargeMapFiller_findFragmentStart,512,,512,512,,,,,,448,,,,,,448
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h
index 9913762ae34df..78a43856f00f1 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.h
+++ b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -71,6 +71,7 @@ struct CfFragment;
 class GPUTPCClusterFinder;
 struct GPUSettingsProcessing;
 struct GPUSettingsRec;
+struct GPUTPCExtraADC;
 
 class GPUChainTracking : public GPUChain
 {
@@ -298,13 +299,15 @@ class GPUChainTracking : public GPUChain
   int32_t RunChainFinalize();
   void OutputSanityCheck();
   int32_t RunTPCTrackingSectors_internal();
-  int32_t RunTPCClusterizer_prepare(bool restorePointers);
+  int32_t RunTPCClusterizer_prepare(bool restorePointers, const GPUTPCExtraADC& extraADCs);
 #ifndef GPUCA_RUN2
-  std::pair<uint32_t, uint32_t> RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane);
+  std::pair<uint32_t, uint32_t> RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane, const GPUTPCExtraADC& extraADCs);
   void RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int32_t stage, bool doGPU, int32_t lane);
   std::pair<uint32_t, uint32_t> TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment);
   std::pair<uint32_t, uint32_t> TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment& fragment);
   void TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment& fragment);
+  void TPCClusterizerTransferExtraADC(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs);
+  void TPCClusterizerCheckExtraADCZeros(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs);
 #endif
   void RunTPCTrackingMerger_MergeBorderTracks(uint8_t mergeMode, GPUReconstruction::krnlDeviceType deviceType);
   void RunTPCTrackingMerger_Resolve(int8_t useOrigTrackParam, int8_t mergeAll, GPUReconstruction::krnlDeviceType deviceType);
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index a2a07be7832ca..4609c9cd59bed 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -17,6 +17,7 @@
 #include "GPUChainTrackingDebug.h"
 #include "GPULogging.h"
 #include "GPUO2DataTypes.h"
+#include "GPUTPCExtraADC.h"
 #include "GPUMemorySizeScalers.h"
 #include "GPUTrackingInputProvider.h"
 #include "GPUNewCalibValues.h"
@@ -56,10 +57,13 @@
 
 #include "utils/VcShim.h"
 #include "utils/strtag.h"
-#include <fstream>
+#include "utils/vecpod.h"
 #include <numeric>
+#include <random>
 #include <vector>
 
+// #define INSERT_SATURATED_SIGNALS
+
 using namespace o2::gpu;
 using namespace o2::tpc;
 using namespace o2::tpc::constants;
@@ -155,11 +159,171 @@ void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfF
   }
 }
 
+void GPUChainTracking::TPCClusterizerTransferExtraADC(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs)
+{
+  const int32_t iSector = clusterer.mISector;
+  const auto& fragment = clusterer.mPmemory->fragment;
+  const auto& digits = extraADCs.digitsBySector[iSector];
+
+  if (fragment.index != 0) {
+    return;
+  }
+
+  if (digits.empty()) {
+    return;
+  }
+
+  const size_t chargeMapSize = TPCMapMemoryLayout<PackedCharge>::items(GetProcessingSettings().overrideClusterizerFragmentLen);
+  const size_t chargeMapSizeBytes = chargeMapSize * sizeof(PackedCharge);
+
+  vecpod<uint16_t> chargeMapHostData;
+  chargeMapHostData.resize(chargeMapSize);
+
+  CfArray2D<PackedCharge> chargeMapHost(reinterpret_cast<PackedCharge*>(chargeMapHostData.data()));
+
+  vecpod<CfChargePos> extraPositions;
+  extraPositions.reserve(digits.size());
+
+  GPUMemCpy(RecoStep::TPCClusterFinding, chargeMapHostData.data(), clustererShadow.mPchargeMap, chargeMapSizeBytes, lane, false);
+  SynchronizeStream(lane);
+
+  for (const auto& d : digits) {
+    if (!fragment.contains(d.getTimeStamp())) {
+      continue;
+    }
+
+    CfChargePos pos{(tpccf::Row)d.getRow(), (tpccf::Pad)d.getPad(), (tpccf::TPCFragmentTime)(d.getTimeStamp() - fragment.start)};
+    chargeMapHost[pos] = PackedCharge(d.getChargeFloat());
+
+    extraPositions.push_back(pos);
+  }
+
+  GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPchargeMap, chargeMapHostData.data(), chargeMapSizeBytes, lane, true);
+
+  const size_t nPositions = clusterer.mPmemory->counters.nPositions;
+  const size_t extraPositionsOffset = nPositions - extraPositions.size();
+  GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPpositions + extraPositionsOffset, extraPositions.data(), extraPositions.size() * sizeof(CfChargePos), lane, true);
+}
+
+void GPUChainTracking::TPCClusterizerCheckExtraADCZeros(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs)
+{
+  const int32_t iSector = clusterer.mISector;
+  const auto& fragment = clusterer.mPmemory->fragment;
+  const auto& digits = extraADCs.digitsBySector[iSector];
+
+  if (fragment.index != 0) {
+    return;
+  }
+
+  if (digits.empty()) {
+    return;
+  }
+
+  const size_t chargeMapSize = TPCMapMemoryLayout<PackedCharge>::items(GetProcessingSettings().overrideClusterizerFragmentLen);
+  const size_t chargeMapSizeBytes = chargeMapSize * sizeof(PackedCharge);
+
+  vecpod<uint16_t> chargeMapHostData;
+  chargeMapHostData.resize(chargeMapSize);
+
+  CfArray2D<PackedCharge> chargeMapHost(reinterpret_cast<PackedCharge*>(chargeMapHostData.data()));
+
+  GPUMemCpy(RecoStep::TPCClusterFinding, chargeMapHostData.data(), clustererShadow.mPchargeMap, chargeMapSizeBytes, lane, false);
+  SynchronizeStream(lane);
+
+  size_t nNonZeroADCs = 0;
+
+  for (const auto& d : digits) {
+    if (!fragment.contains(d.getTimeStamp())) {
+      continue;
+    }
+
+    CfChargePos pos{(tpccf::Row)d.getRow(), (tpccf::Pad)d.getPad(), (tpccf::TPCFragmentTime)(d.getTimeStamp() - fragment.start)};
+
+    auto adc = chargeMapHost[pos].unpack();
+
+    if (adc != 0) {
+      nNonZeroADCs++;
+    }
+  }
+
+  if (nNonZeroADCs > 0) {
+    GPUInfo("Non Zero ADCs: %zu", nNonZeroADCs);
+  } else {
+    GPUInfo("Cleared all extra ADC values!", nNonZeroADCs);
+  }
+}
+
 namespace
 {
 struct TPCCFDecodeScanTmp {
   int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter;
 };
+
+// Additional ADC values must be generated at start of clusterizer
+// This is required, so enough memory is allocated for the charge points
+// And ADCs can be injected by "simply"
+//  -> copying chargeMap + chargePositions to host
+//  -> writing additional adcs to chargeMap + positions
+//  -> copying values to device
+GPUTPCExtraADC GenerateSaturatedSignals(size_t seed = 42)
+{
+  constexpr int32_t MinTailLength = 50;
+  constexpr int32_t MaxTailLength = 200;
+  constexpr int32_t TailWidth = 3; // Assume tails are 3 pads wide at the moment
+
+  constexpr GPUTPCGeometry geo;
+
+  GPUTPCExtraADC adcs;
+
+  const int32_t nHIPs = 50;
+  const int32_t firstTB = 0;                   // Place all HIPs in first fragment for now
+  const int32_t lastTB = 4000 - MaxTailLength; // Don't allow cut off tails at fragment borders
+  const int32_t tailADC = 250;                 // charge should decrease over time, but for now just hardcode ADC above the threshold
+
+  std::mt19937 gen{seed};
+  std::uniform_int_distribution<> randomRow(0, GPUTPCGeometry::NROWS - 1);
+  std::uniform_int_distribution<> randomTB(firstTB, lastTB);
+  std::uniform_int_distribution<> randomTailLength(MinTailLength, MaxTailLength);
+  // std::normal_distribution<> tailLengthNoise(8, 2.0);
+
+  for (int32_t iHIP = 0; iHIP < nHIPs; iHIP++) {
+
+    const int32_t row = randomRow(gen);
+    const int32_t nPads = geo.NPads(row);
+    std::uniform_int_distribution<> randomPad(0, nPads - 1);
+
+    const int32_t basePad = randomPad(gen);
+    const int32_t baseTb = randomTB(gen);
+
+    auto& digits = adcs.digitsBySector[0];
+
+    const int32_t tailLength = randomTailLength(gen);
+
+    for (int32_t dPad = -TailWidth; dPad <= TailWidth; dPad++) {
+      const int32_t iPad = basePad + dPad;
+      if (iPad < 0 || iPad >= nPads) {
+        continue;
+      }
+
+      for (int32_t dTime = 0; dTime < tailLength; dTime++) {
+        const int32_t iTime = baseTb + dTime;
+
+        if (iTime >= 4000) {
+          break;
+        }
+
+        const auto adc = dTime == 0 && dPad == 0 ? 1023 : tailADC;
+
+        digits.emplace_back(0, adc, row, iPad, iTime);
+      }
+    }
+  }
+
+  GPUInfo("Generated %zu ADCs!", adcs.digitsBySector[0].size());
+
+  return adcs;
+}
+
 } // namespace
 
 std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment)
@@ -437,13 +601,16 @@ void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clust
   }
 }
 
-std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane)
+std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane, const GPUTPCExtraADC& extraADCs)
 {
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
   if (mCFContext->abandonTimeframe) {
     return {0, 0};
   }
-  const auto& retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment);
+  auto retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment);
+  if (fragment.index == 0) {
+    retVal.first += extraADCs.digitsBySector[iSector].size();
+  }
   if (doGPU) {
     GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
     GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
@@ -473,7 +640,7 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int
   return retVal;
 }
 
-int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
+int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers, const GPUTPCExtraADC& extraADCs)
 {
   bool doGPU = mRec->GetRecoStepsGPU() & gpudatatypes::RecoStep::TPCClusterFinding;
   if (restorePointers) {
@@ -569,7 +736,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
   mCFContext->fragmentFirst = CfFragment{std::max<int32_t>(mCFContext->tpcMaxTimeBin + 1, maxFragmentLen), maxFragmentLen};
   for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) {
     if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
-      mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector);
+      mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector, extraADCs);
     }
   }
 
@@ -595,7 +762,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   mRec->PushNonPersistentMemory(qStr2Tag("TPCCLUST"));
   const auto& threadContext = GetThreadContext();
   const bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
-  if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) {
+
+  GPUTPCExtraADC extraADCs;
+#ifdef INSERT_SATURATED_SIGNALS
+  extraADCs = GenerateSaturatedSignals();
+#endif
+
+  if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer, extraADCs)) {
     return 1;
   }
   if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) {
@@ -625,7 +798,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     SetupGPUProcessor(&processors()->tpcClusterer[iSector], true); // Now we allocate
   }
   if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
-    RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline)
+    RunTPCClusterizer_prepare(true, extraADCs); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline)
   }
 
   if (doGPU && mIOPtrs.tpcZS) {
@@ -769,6 +942,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   const bool propagateMCLabels = buildNativeHost && GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC;
   const bool sortClusters = buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4);
 
+  if (GetProcessingSettings().runMC && (!processors()->ioPtrs.tpcPackedDigits || !processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC)) {
+    GPUWarning("Requested to process MC labels, but no labels present");
+  }
+
   auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr;
 
   mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor;
@@ -938,7 +1115,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             f = mCFContext->fragmentFirst;
           }
           if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) {
-            mCFContext->nextPos[nextSector] = RunTPCClusterizer_transferZS(nextSector, f, GetProcessingSettings().nTPCClustererLanes + lane);
+            mCFContext->nextPos[nextSector] = RunTPCClusterizer_transferZS(nextSector, f, GetProcessingSettings().nTPCClustererLanes + lane, extraADCs);
           }
         }
         GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
@@ -949,9 +1126,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         if (!mIOPtrs.tpcZS) {
           runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillFromDigits>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
         }
-        if (DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererDigits, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile)) {
-          clusterer.DumpChargeMap(*mDebugFile, "Charges");
-        }
+
+        TPCClusterizerTransferExtraADC(clusterer, clustererShadow, lane, extraADCs);
 
         if (propagateMCLabels) {
           runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillIndexMap>({GetGrid(clusterer.mPmemory->counters.nDigitsInFragment, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}});
@@ -960,14 +1136,33 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0);
         checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true);
         checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter;
+        // TODO Move hipTailFilter flag to ProcessingSettings?
+        // TODO Add some warning when re enabling pad filter with this flag, so it's not just silently enabled when disabling was requested
+        checkForNoisyPads |= rec()->GetParam().rec.tpc.hipTailFilter;
+
+        if (rec()->GetParam().rec.tpc.hipTailFilter && !doGPU) {
+          GPUError("HIP tail filter enabled, but this is currently not supported on CPU");
+        }
 
         if (checkForNoisyPads) {
+          if (rec()->GetParam().rec.tpc.hipTailFilter) {
+            runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPhipTailsByRow, GPUTPCGeometry::NROWS * sizeof(*clustererShadow.mPhipTailsByRow) * GPUTPCCFHIPClusterizer::MaxHIPTailsPerRow);
+            runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPnHIPTails, GPUTPCGeometry::NROWS * sizeof(*clustererShadow.mPnHIPTails));
+          }
           const int32_t nBlocks = GPUTPCCFCheckPadBaseline::GetNBlocks(doGPU);
 
           runKernel<GPUTPCCFCheckPadBaseline>({GetGridBlk(nBlocks, lane), {iSector}});
           getKernelTimer<GPUTPCCFCheckPadBaseline>(RecoStep::TPCClusterFinding, iSector, TPC_REAL_PADS_IN_SECTOR * fragment.lengthWithoutOverlap() * sizeof(PackedCharge), false);
         }
 
+        DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererDigits, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile);
+        // Avoid additional sync when also dumping digits
+        const bool debugSyncChargeMap = !(GetProcessingSettings().debugMask & GPUChainTrackingDebugFlags::TPCClustererDigits);
+        // DumpChargeMap should run after noisy pad filter to avoid yet another dump of intermediate data. When chargemap without pad filter is required, disable pad filter instead.
+        DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, debugSyncChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Charges");
+
+        TPCClusterizerCheckExtraADCZeros(clusterer, clustererShadow, lane, extraADCs);
+
         runKernel<GPUTPCCFPeakFinder>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
         if (DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererPeaks, clusterer, &GPUTPCClusterFinder::DumpPeaks, *mDebugFile)) {
           clusterer.DumpPeakMap(*mDebugFile, "Peaks");
@@ -1002,6 +1197,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
         GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
 
+        if (clusterer.mPmemory->counters.nPositions == 0) {
+          return;
+        }
+
         if (doGPU) {
           SynchronizeStream(lane);
         }
@@ -1015,11 +1214,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUTPCGeometry::NROWS * sizeof(*clustererShadow.mPclusterInRow));
         }
 
-        if (clusterer.mPmemory->counters.nClusters == 0) {
-          return;
-        }
-
-        if (GetProcessingSettings().nn.applyNNclusterizer) {
+        const auto nRegularClusters = clusterer.mPmemory->counters.nClusters;
+        if (nRegularClusters != 0) {
+          if (GetProcessingSettings().nn.applyNNclusterizer) {
 #ifdef GPUCA_HAS_ONNX
           GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane];
           GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
@@ -1144,7 +1341,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             if(!nn_settings.nnClusterizerApplyCfDeconvolution) { // If it is already applied don't do it twice, otherwise apply now
               runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
             }
-            DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+            DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMapSplit, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
             runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
             if (nn_settings.nnClusterizerVerbosity > 3) {
               LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
@@ -1155,16 +1352,31 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 #endif
         } else {
           runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
-          DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+          DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMapSplit, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
           runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0);
         }
 
-        if (doGPU && propagateMCLabels) {
-          TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane);
-          if (doGPU) {
+          if (doGPU && propagateMCLabels) {
+            TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane);
+            if (doGPU) {
+              SynchronizeStream(lane);
+            }
+            runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); // Computes MC labels
+          }
+        }
+
+        // TODO: Move this right after CheckPadBaseline once tail zeroing is moved into this kernel.
+        if (rec()->GetParam().rec.tpc.hipTailFilter) {
+          runKernel<GPUTPCCFHIPTailConnector>({GetGridBlk(GPUTPCGeometry::NROWS, lane), {iSector}});
+          runKernel<GPUTPCCFHIPClusterizer>({GetGridBlk(GPUTPCGeometry::NROWS, lane), {iSector}});
+          if (doGPU && (nRegularClusters == 0 || GetProcessingSettings().debugLevel >= 3)) {
+            TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
             SynchronizeStream(lane);
           }
-          runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); // Computes MC labels
+        }
+
+        if (clusterer.mPmemory->counters.nClusters == 0) {
+          return;
         }
 
         if (GetProcessingSettings().debugLevel >= 3) {
@@ -1173,8 +1385,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
         TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clusterer, lane);
         laneHasData[lane] = true;
-        // Include clusters in default debug mask, exclude other debug output by default
-        DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererClusters, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); // clang-format off
       });
       mRec->SetNActiveThreadsOuterLoop(1);
     }
@@ -1192,6 +1402,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
       if (laneHasData[lane]) {
         anyLaneHasData = true;
+        // Include clusters in default debug mask, exclude other debug output by default.
+        // The cluster buffers are accumulated per sector, so dump them once after all fragments.
+        DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererClusters, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); // clang-format off
         if (buildNativeGPU && GetProcessingSettings().tpccfGatherKernel) {
           runKernel<GPUTPCCFGather>({GetGridBlk(GPUTPCGeometry::NROWS, mRec->NStreams() - 1), {iSector}}, &mInputsShadow->mPclusterNativeBuffer[nClsTotal]);
         }
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index dda15d403407e..6ca50f24351c4 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -16,6 +16,7 @@
 #include "GPUChainTrackingDebug.h"
 #include "GPULogging.h"
 #include "GPUO2DataTypes.h"
+#include "GPUTPCExtraADC.h"
 #include "GPUTrackingInputProvider.h"
 #include "GPUTPCCFChainContext.h"
 #include "TPCClusterDecompressor.h"
@@ -63,7 +64,7 @@ int32_t GPUChainTracking::RunTPCCompression()
   if (mPipelineFinalizationCtx && GetProcessingSettings().doublePipelineClusterizer) {
     SynchronizeEventAndRelease(mEvents->single);
     auto* foreignChain = (GPUChainTracking*)GetNextChainInQueue();
-    foreignChain->RunTPCClusterizer_prepare(false);
+    foreignChain->RunTPCClusterizer_prepare(false, {});
     foreignChain->mCFContext->ptrClusterNativeSave = processorsShadow()->ioPtrs.clustersNative;
   }
 #endif
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h
index a0be9d833d5a9..39e48f9f14d9a 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h
+++ b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h
@@ -45,7 +45,8 @@ enum GPUChainTrackingDebugFlags : uint32_t {
   TPCClustererPeaks = 1 << 19,
   TPCClustererSuppressedPeaks = 1 << 20,
   TPCClustererChargeMap = 1 << 21,
-  TPCClustererZeroedCharges = 1 << 22
+  TPCClustererChargeMapSplit = 1 << 22,
+  TPCClustererZeroedCharges = 1 << 23
 };
 
 template <class T, class S, typename... Args>
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
index 204d9d6a8b81a..3c05284d3b876 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
@@ -16,14 +16,187 @@
 #include "CfArray2D.h"
 #include "PackedCharge.h"
 #include "clusterFinderDefs.h"
+#include "DataFormatsTPC/ClusterNative.h"
 
 #ifndef GPUCA_GPUCODE
 #include "utils/VcShim.h"
 #endif
 
+#if 0
+#define DPRINT(...) printf(__VA_ARGS__)
+#define DPRINTB(...) \
+  if (iThread == 0)  \
+  printf(__VA_ARGS__)
+#define DPRINTB_IF(test, ...) \
+  if (iThread == 0 && (test)) \
+  printf(__VA_ARGS__)
+#else
+#define DPRINT(...) ((void)0)
+#define DPRINTB(...) ((void)0)
+#define DPRINTB_IF(test, ...) ((void)0)
+#endif
+
 using namespace o2::gpu;
 using namespace o2::gpu::tpccf;
 
+using Kernel = GPUTPCCFCheckPadBaseline;
+
+static GPUdi() HIPTailDescriptor* GetHIPTails(GPUTPCClusterFinder& clusterer, int32_t row)
+{
+  // HIP TAILS: indexing starts at 1, so 0 index indicates no connection
+  return clusterer.mPhipTailsByRow + row * GPUTPCCFHIPClusterizer::MaxHIPTailsPerRow;
+}
+
+static GPUdi() Charge UpdateHIPTailFilter(Charge filteredCharge, Charge charge, Charge alpha)
+{
+  return filteredCharge + alpha * (charge - filteredCharge);
+}
+
+static GPUdi() float HIPTailTimeMean(const HIPTailDescriptor& tail)
+{
+  const float length = tail.tailEnd > tail.tailStart ? float(tail.tailEnd - tail.tailStart) : 1.f;
+  return tail.tailStart + 0.5f * (length - 1.f);
+}
+
+static GPUdi() float HIPTailTimeVariance(const HIPTailDescriptor& tail)
+{
+  const float length = tail.tailEnd > tail.tailStart ? float(tail.tailEnd - tail.tailStart) : 1.f;
+  return (length * length - 1.f) * (1.f / 12.f);
+}
+
+// Collect tails marked for closing across the workgroup using a prefix scan,
+// then cooperatively zero the charge map entries for each closed tail.
+// Caller must set acc.activeHIPTail.end before calling if the tail is open.
+static GPUdi() uint16_t CloseHIPTails(
+  Kernel::GPUSharedMemory& smem,
+  GPUTPCClusterFinder& clusterer,
+  int32_t iThread, int32_t nThreads,
+  int16_t iPadHandle,
+  CfChargePos basePos,
+  CfArray2D<PackedCharge>& chargeMap,
+  Kernel::PadChargeAccu& acc,
+  bool shouldCloseTail)
+{
+  const uint32_t row = basePos.row();
+  const uint16_t nClosedTails = work_group_count(shouldCloseTail);
+
+  auto* nHIPTails = clusterer.mPnHIPTails;
+  auto* hipTails = GetHIPTails(clusterer, row);
+
+  if (nClosedTails > 0) {
+    int16_t iClosedTail = work_group_scan_inclusive_add((int16_t)shouldCloseTail) - 1;
+    const bool shouldStoreTail = shouldCloseTail && acc.activeHIPTail.Length() > 0;
+    uint16_t nStoredTails = work_group_count(shouldStoreTail);
+    int16_t iStoredTail = work_group_scan_inclusive_add((int16_t)shouldStoreTail) - 1;
+
+    // Use exactly one atomic add per closing call to reduce differences in
+    // tail ordering between runs.
+    if (nStoredTails > 0) {
+      if (iThread == 0) {
+        smem.tailStoreBase = CAMath::AtomicAdd(&nHIPTails[row], (uint32_t)nStoredTails);
+      }
+      GPUbarrier();
+    }
+    if (shouldCloseTail) {
+      smem.tailsClosedPad[iClosedTail] = iPadHandle;
+      smem.tailsClosed[iClosedTail] = acc.activeHIPTail;
+      smem.tailsClosedStoreIdx[iClosedTail] = GPUTPCCFHIPTailConnector::MaxHIPTailsPerRow;
+
+      if (shouldStoreTail) {
+        const uint32_t idx = smem.tailStoreBase + iStoredTail + 1;
+        smem.tailsClosedStoreIdx[iClosedTail] = idx;
+        if (idx < GPUTPCCFHIPTailConnector::MaxHIPTailsPerRow) {
+          hipTails[idx] = {0, 0, (uint16_t)iPadHandle,
+                           (uint16_t)acc.activeHIPTail.start, (uint16_t)acc.activeHIPTail.end,
+                           0.f, 0.f};
+        }
+      }
+
+      acc.tailFilterCharge = 0;
+      acc.activeHIPTail.Reset();
+    }
+
+    GPUbarrier();
+  }
+
+  // TODO: performance improvement -> parallelize this loop across tails
+  for (uint16_t iTail = 0; iTail < nClosedTails; iTail++) {
+    const auto tailPad = smem.tailsClosedPad[iTail];
+    const auto tail = smem.tailsClosed[iTail];
+    const uint32_t tailStoreIdx = smem.tailsClosedStoreIdx[iTail];
+
+    Charge qTot = 0.f;
+    Charge qMax = 0.f;
+    for (uint16_t iTime = iThread; iTime < tail.Length(); iTime += nThreads) {
+      const int16_t time = tail.start + iTime;
+      auto pos = basePos.delta({tailPad, time});
+      const Charge q = chargeMap[pos].unpack();
+      qTot += q;
+      qMax = CAMath::Max(qMax, q);
+      chargeMap[pos] = PackedCharge{0};
+    }
+
+    smem.tailQTotScratch[iThread] = qTot;
+    smem.tailQMaxScratch[iThread] = qMax;
+    GPUbarrier();
+    for (uint16_t active = nThreads; active > 1;) {
+      const uint16_t stride = (active + 1) / 2;
+      if (iThread < active - stride) {
+        smem.tailQTotScratch[iThread] += smem.tailQTotScratch[iThread + stride];
+        smem.tailQMaxScratch[iThread] = CAMath::Max(smem.tailQMaxScratch[iThread], smem.tailQMaxScratch[iThread + stride]);
+      }
+      active = stride;
+      GPUbarrier();
+    }
+
+    if (iThread == 0 && tailStoreIdx < GPUTPCCFHIPTailConnector::MaxHIPTailsPerRow) {
+      HIPTailDescriptor& tailDescriptor = hipTails[tailStoreIdx];
+      tailDescriptor.qTot = smem.tailQTotScratch[0];
+      tailDescriptor.qMax = smem.tailQMaxScratch[0];
+    }
+  }
+
+  return nClosedTails;
+}
+
+template <bool CheckHIPTrigger, bool CheckHIPTailEnd>
+static GPUdi() void ScanCachedCharges(Kernel::GPUSharedMemory& smem, uint16_t timeOffset, uint16_t pad, Charge hipTailThreshold, Charge hipTailFilterAlpha, Kernel::PadChargeAccu& acc)
+{
+  for (int32_t i = 0; i < Kernel::NumOfCachedTBs; i++) {
+    const Charge qs = smem.charges[i][pad];
+    const int16_t curTB = timeOffset + i;
+
+    acc.totalCharges += qs > 0;
+    acc.consecCharges = qs > 0 ? acc.consecCharges + 1 : 0;
+    acc.maxConsecCharges = CAMath::Max(acc.consecCharges, acc.maxConsecCharges);
+    acc.maxCharge = CAMath::Max<Charge>(qs, acc.maxCharge);
+
+    if (qs >= hipTailThreshold) {
+      if (acc.aboveThresholdStart < 0) {
+        acc.aboveThresholdStart = curTB;
+      }
+    } else {
+      acc.aboveThresholdStart = -1;
+    }
+
+    if constexpr (CheckHIPTrigger) {
+      if (acc.HIPtb < 0 && qs >= Charge(Kernel::MaxADC)) {
+        acc.HIPtb = acc.aboveThresholdStart; // start of rising edge, not first sat TB
+        smem.tails[pad] = {acc.HIPtb, 0};    // Broadcast HIP start TB to neighboring pads / threads
+      }
+    }
+
+    if constexpr (CheckHIPTailEnd) {
+      if (acc.activeHIPTail.IsOpen()) {
+        acc.tailFilterCharge = UpdateHIPTailFilter(acc.tailFilterCharge, qs, hipTailFilterAlpha);
+        if (acc.tailFilterCharge < hipTailThreshold) {
+          acc.activeHIPTail.end = curTB;
+        }
+      }
+    }
+  }
+}
+
 template <>
 GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
 {
@@ -49,6 +222,9 @@ GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t
   }
 
   const CfFragment& fragment = clusterer.mPmemory->fragment;
+  const bool hipFilterOn = clusterer.Param().rec.tpc.hipTailFilter;
+  const Charge hipTailThreshold = clusterer.Param().rec.tpc.hipTailFilterThreshold;
+  const Charge hipTailFilterAlpha = clusterer.Param().rec.tpc.hipTailFilterAlpha;
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
 
   constexpr GPUTPCGeometry geo;
@@ -57,45 +233,133 @@ GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t
   const auto nPads = geo.NPads(iRow);
   const CfChargePos basePos{(Row)iRow, 0, 0};
 
-  int32_t totalCharges = 0;
-  int32_t consecCharges = 0;
-  int32_t maxConsecCharges = 0;
-  Charge maxCharge = 0;
+  PadChargeAccu acc;
 
   const int16_t iPadOffset = iThread % MaxNPadsPerRow;
   const int16_t iTimeOffset = iThread / MaxNPadsPerRow;
   const int16_t iPadHandle = iThread;
   const bool handlePad = iPadHandle < nPads;
 
-  const auto firstTB = fragment.firstNonOverlapTimeBin();
-  const auto lastTB = fragment.lastNonOverlapTimeBin();
+  if (iPadHandle < MaxNPadsPerRow) {
+    smem.tails[iPadHandle] = {-1, -1};
+  }
+  GPUbarrier();
 
-  for (auto t = firstTB; t < lastTB; t += NumOfCachedTBs) {
+  // Pad filter scans the entire fragments including overlap.
+  // Minimal runtime overhead and prevents headaches later on as
+  // saturated signal in overlap region can create tails in the next fragment
+  // even when cleared in current fragment as they're decoded twice
+  const TPCFragmentTime firstTB = 0;
+  const TPCFragmentTime lastTB = fragment.length;
 
-    const TPCFragmentTime iTime = t + iTimeOffset;
+  for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs) {
 
-    const CfChargePos pos = basePos.delta({iPadOffset, iTime});
+    bool thisThreadHasTrigger = false;
+    for (uint16_t tt = 0; tt < NumOfCachedTBs; tt += TimebinsPerCacheline) {
+      const TPCFragmentTime iTimeLoad = t + tt + iTimeOffset;
 
-    smem.charges[iTimeOffset][iPadOffset] = iTime < lastTB && iPadOffset < nPads ? chargeMap[pos].unpack() : 0;
+      const CfChargePos pos = basePos.delta({iPadOffset, iTimeLoad});
 
-    GPUbarrier();
+      const Charge ql = iTimeLoad < lastTB && iPadOffset < nPads ? chargeMap[pos].unpack() : 0;
+      smem.charges[tt + iTimeOffset][iPadOffset] = ql;
+
+      thisThreadHasTrigger |= ql >= Charge(MaxADC);
+    }
+
+    const bool hasHIPTrigger = hipFilterOn && work_group_any(thisThreadHasTrigger);
+
+    acc.HIPtb = -1;
 
     if (handlePad) {
-      for (int32_t i = 0; i < NumOfCachedTBs; i++) {
-        const Charge q = smem.charges[i][iPadHandle];
-        totalCharges += (q > 0);
-        consecCharges = (q > 0) ? consecCharges + 1 : 0;
-        maxConsecCharges = CAMath::Max(consecCharges, maxConsecCharges);
-        maxCharge = CAMath::Max<Charge>(q, maxCharge);
+
+      // TODO: is this really necessary?
+      // Why is the old version so much slower, when we just add short branches to the loop???
+      if (!hasHIPTrigger) [[likely]] {
+        if (!acc.activeHIPTail.IsOpen()) {
+          ScanCachedCharges<false, false>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
+        } else {
+          ScanCachedCharges<false, true>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
+        }
+      } else {
+        if (!acc.activeHIPTail.IsOpen()) {
+          ScanCachedCharges<true, false>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
+        } else {
+          ScanCachedCharges<true, true>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
+        }
       }
     }
 
     GPUbarrier();
-  }
+
+    if (hasHIPTrigger) [[unlikely]] {
+
+      DPRINTB("%d: Trigger!\n", iBlock);
+
+      if (handlePad && acc.HIPtb < 0) {
+
+        // Search neighboring pads for trigger
+        for (int16_t i = -SSClusterPadWidth; i < 0; i++) {
+          const auto p = iPadHandle + i;
+          if (p > -1) {
+            acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb);
+          }
+        }
+
+        for (int16_t i = 1; i <= SSClusterPadWidth; i++) {
+          const auto p = iPadHandle + i;
+          if (p < MaxNPadsPerRow) {
+            acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb);
+          }
+        }
+      }
+
+      bool shouldCloseTail = acc.HIPtb > -1 && acc.activeHIPTail.HasValue();
+      if (shouldCloseTail && acc.activeHIPTail.IsOpen()) {
+        DPRINT("%d: end = %d\n", iThread, acc.HIPtb);
+        acc.activeHIPTail.end = acc.HIPtb;
+      }
+
+      CloseHIPTails(smem, clusterer, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail);
+
+      GPUbarrier();
+
+      if (acc.HIPtb > -1) {
+        DPRINT("%d: start = %d\n", iThread, acc.HIPtb);
+        acc.activeHIPTail.SetOpen(acc.HIPtb);
+        acc.tailFilterCharge = Charge(MaxADC);
+      }
+
+      // Clear smem between iterations to prevent stale entries
+      if (handlePad) {
+        smem.tails[iPadHandle].Reset();
+      }
+
+      GPUbarrier();
+
+    } // if (hipTriggerFound)
+
+  } // for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs)
 
   if (handlePad) {
-    updatePadBaseline(basePos.gpad + iPadHandle, clusterer, totalCharges, maxConsecCharges, maxCharge);
+    updatePadBaseline(basePos.gpad + iPadHandle, clusterer, acc.totalCharges, acc.maxConsecCharges, acc.maxCharge);
   }
+
+  // --- Close remaining tails
+  const bool shouldCloseTail = acc.activeHIPTail.HasValue();
+
+  // Call `work_group_any` here, instead of always counting.
+  // This is important as `work_group_count` is a lot slower
+  // and has a lot of overhead if no HIPs were found.
+  if (work_group_any(shouldCloseTail)) {
+    if (shouldCloseTail && acc.activeHIPTail.IsOpen()) {
+      acc.activeHIPTail.end = lastTB;
+    }
+
+    [[maybe_unused]] const uint16_t nClosedTails = CloseHIPTails(smem, clusterer, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail);
+
+    DPRINTB_IF(nClosedTails > 0, "%d: Close remaining tails (%d)\n", iBlock, nClosedTails);
+  }
+
 #endif
 }
 
@@ -172,3 +436,131 @@ GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int32_t pad, const GPUTP
     clusterer.mPpadIsNoisy[pad] = true;
   }
 }
+
+// ======== HIP Tail Connector Kernel ========
+
+template <>
+GPUd() void GPUTPCCFHIPTailConnector::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
+{
+  if (iBlock >= (int32_t)GPUTPCGeometry::NROWS) {
+    return;
+  }
+  const uint32_t row = iBlock;
+
+  const uint32_t nTails = CAMath::Min(clusterer.mPnHIPTails[row], (uint32_t)MaxHIPTailsPerRow - 1);
+
+  // HIP TAILS: indexing starts at 1, so 0 index indicates no connection
+  HIPTailDescriptor* tails = GetHIPTails(clusterer, row);
+
+  for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads) {
+    auto* tail = &tails[iTail];
+
+    // TODO: this is needed because tailStarts may vary due to rising edge
+    // Better approach would be to also track the triggered timebin and match that instead
+    uint16_t overlapWindowStart = tail->tailStart >= 5 ? tail->tailStart - 5 : 0;
+    uint16_t overlapWindowEnd = tail->tailStart + 5;
+
+    for (uint32_t jTail = iTail + 1; jTail <= nTails; jTail++) {
+      auto* tailNext = &tails[jTail];
+      if (tailNext->iPrev > 0) {
+        continue;
+      }
+
+      const bool overlapPad = tailNext->pad >= tail->pad - GPUTPCCFCheckPadBaseline::SSClusterPadWidth && tailNext->pad <= tail->pad + GPUTPCCFCheckPadBaseline::SSClusterPadWidth;
+      const bool overlapTime = tailNext->tailStart >= overlapWindowStart && tailNext->tailStart < overlapWindowEnd;
+
+      if (overlapPad && overlapTime) {
+        if (CAMath::AtomicCAS(&tailNext->iPrev, 0u, iTail)) {
+          tail->iNext = jTail;
+          break;
+        }
+      }
+    }
+  }
+}
+
+// ======== HIP Clusterizer Kernel ========
+
+template <>
+GPUd() void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
+{
+  if (iBlock >= (int32_t)GPUTPCGeometry::NROWS) {
+    return;
+  }
+
+  const uint32_t row = iBlock;
+  uint32_t nTails = clusterer.mPnHIPTails[row];
+  nTails = CAMath::Min(nTails, (uint32_t)MaxHIPTailsPerRow - 1);
+
+  HIPTailDescriptor* tails = GetHIPTails(clusterer, row);
+  const auto& fragment = clusterer.mPmemory->fragment;
+
+  tpccf::SizeT nCreatedClusters = 0;
+  for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads) {
+
+    auto* tail = &tails[iTail];
+
+    if (tail->iPrev != 0) {
+      continue;
+    }
+
+    float qTot = tail->qTot;
+    float qMax = tail->qMax;
+    const float firstWeight = tail->qTot;
+    const float firstPad = tail->pad;
+    const float firstTime = HIPTailTimeMean(*tail);
+    const float firstTimeVariance = HIPTailTimeVariance(*tail);
+    float padSum = firstWeight * firstPad;
+    float padSqSum = firstWeight * firstPad * firstPad;
+    float timeSum = firstWeight * firstTime;
+    float timeSqSum = firstWeight * (firstTime * firstTime + firstTimeVariance);
+
+    while (tail->iNext != 0) {
+
+      tail = &tails[tail->iNext];
+
+      const float tailWeight = tail->qTot;
+      const float tailPad = tail->pad;
+      const float tailTime = HIPTailTimeMean(*tail);
+      const float tailTimeVariance = HIPTailTimeVariance(*tail);
+      qMax = CAMath::Max(qMax, tail->qMax);
+      qTot += tail->qTot;
+      padSum += tailWeight * tailPad;
+      padSqSum += tailWeight * tailPad * tailPad;
+      timeSum += tailWeight * tailTime;
+      timeSqSum += tailWeight * (tailTime * tailTime + tailTimeVariance);
+    }
+
+    const float weightSum = CAMath::Max(qTot, 1.f);
+    float padMean = padSum / weightSum;
+    float timeMean = timeSum / weightSum; // TODO: Use timebin of saturated signal instead! Time mean is biased for long tails.
+    float padSigma = CAMath::Sqrt(CAMath::Max(0.f, padSqSum / weightSum - padMean * padMean));
+    float timeSigma = CAMath::Sqrt(CAMath::Max(0.f, timeSqSum / weightSum - timeMean * timeMean));
+
+    o2::tpc::ClusterNative cn;
+    cn.qMax = qMax;
+    cn.qTot = (uint16_t)CAMath::Min(qTot, 65535.f);
+    float clusterTime = fragment.start + timeMean - clusterer.Param().rec.tpc.clustersShiftTimebinsClusterizer;
+    cn.setTimeFlags(clusterTime, 0);
+    cn.setPad(padMean);
+    cn.setSigmaTime(timeSigma);
+    cn.setSigmaPad(padSigma);
+
+    if (cn.qMax >= 1023) {
+      // Cut off clusters where the tail connection failed for some reason
+      // TODO: Deduplicate with GPUTPCCFClusterizer::sortIntoBuckets (can't call cross-kernel).
+      // TODO: Add error reporting for row cluster overflow.
+      uint32_t index = CAMath::AtomicAdd(&clusterer.mPclusterInRow[row], 1u);
+      if (index < clusterer.mNMaxClusterPerRow) {
+        clusterer.mPclusterByRow[clusterer.mNMaxClusterPerRow * row + index] = cn;
+        nCreatedClusters++;
+      }
+    }
+  }
+
+#if defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
+  CAMath::AtomicAdd(reinterpret_cast<unsigned long long*>(&clusterer.mPmemory->counters.nClusters), (unsigned long long)nCreatedClusters);
+#else
+  CAMath::AtomicAdd(&clusterer.mPmemory->counters.nClusters, nCreatedClusters);
+#endif
+}
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h
index 7638b95ee7f0b..f78f91a548ac9 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h
@@ -15,6 +15,11 @@
 /// Kernel identifies noisy TPC pads by analyzing charge patterns over time.
 /// A pad is marked noisy if it exceeds thresholds for total or consecutive
 /// time bins with charge, unless the charge exceeds a saturation threshold.
+///
+/// Optionally detects Highly Ionising Particle (HIP) tails: when a saturated
+/// ADC value (1023) is found, the tail region on the triggering pad and its
+/// neighbors is zeroed in the charge map until an exponential charge filter
+/// drops below a configurable threshold.
 
 #ifndef O2_GPU_GPU_TPC_CF_CHECK_PAD_BASELINE_H
 #define O2_GPU_GPU_TPC_CF_CHECK_PAD_BASELINE_H
@@ -29,6 +34,16 @@
 namespace o2::gpu
 {
 
+struct HIPTailDescriptor {
+  uint32_t iPrev;
+  uint32_t iNext;
+  uint16_t pad;
+  uint16_t tailStart;
+  uint16_t tailEnd;
+  float qTot;
+  float qMax;
+};
+
 class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
 {
 
@@ -39,15 +54,65 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
     EntriesPerCacheline = PadsPerCacheline * TimebinsPerCacheline,
     NumOfCachedPads = GPUCA_WARP_SIZE / TimebinsPerCacheline,
     NumCLsPerWarp = GPUCA_WARP_SIZE / EntriesPerCacheline,
-    NumOfCachedTBs = TimebinsPerCacheline,
+    NumOfCachedTBs = TimebinsPerCacheline * 8,
     // Threads index shared memory as [iThread / MaxNPadsPerRow][iThread % MaxNPadsPerRow].
     // Rounding up to a multiple of PadsPerCacheline ensures iThread / MaxNPadsPerRow < NumOfCachedTBs
     // for all threads, avoiding out-of-bounds access.
     MaxNPadsPerRow = CAMath::nextMultipleOf<PadsPerCacheline>(GPUTPCGeometry::MaxNPadsPerRow()),
+
+    MaxADC = 1023,
+
+    NThreads = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline),
+    SSClusterPadWidth = 5,
   };
 
-  struct GPUSharedMemory {
+  union HipTailRange {
+    struct {
+      int16_t start;
+      int16_t end;
+    };
+
+    // Be careful with using default initialized values.
+    // Need default constructor, so can be placed in shared memory.
+    // Might be zero initialized, but invalid tail needs start = end = -1 instead.
+    GPUdDefault() HipTailRange() = default;
+    GPUdi() HipTailRange(int16_t st, int16_t e) : start(st), end(e) {}
+
+    GPUdi() bool HasValue() const { return start > -1; }
+    GPUdi() bool IsOpen() const { return start > -1 && end < 0; }
+
+    GPUdi() void SetOpen(int16_t st)
+    {
+      start = st;
+      end = -1;
+    }
+
+    GPUdi() int16_t Length() const { return end - start; }
+
+    GPUdi() void Reset() { start = end = -1; }
+  };
+
+  struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64<int16_t, NThreads> {
     tpccf::Charge charges[NumOfCachedTBs][MaxNPadsPerRow];
+    HipTailRange tails[MaxNPadsPerRow];
+    uint8_t tailsClosedPad[MaxNPadsPerRow];
+    HipTailRange tailsClosed[MaxNPadsPerRow];
+    uint32_t tailsClosedStoreIdx[MaxNPadsPerRow];
+    tpccf::Charge tailQTotScratch[NThreads];
+    tpccf::Charge tailQMaxScratch[NThreads];
+    uint32_t tailStoreBase;
+  };
+
+  // Accumulated values from scanning cached charges in a pad
+  struct PadChargeAccu {
+    int32_t totalCharges = 0;
+    int32_t consecCharges = 0;
+    int32_t maxConsecCharges = 0;
+    tpccf::Charge maxCharge = 0;
+    int16_t HIPtb = -1;
+    int16_t aboveThresholdStart = -1; // first TB of current above-hipTailThreshold streak; used to extend the tail back over the rising edge before saturation
+    HipTailRange activeHIPTail{-1, -1};
+    tpccf::Charge tailFilterCharge = 0;
   };
 
   typedef GPUTPCClusterFinder processorType;
@@ -79,6 +144,58 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
   GPUd() static void updatePadBaseline(int32_t pad, const GPUTPCClusterFinder&, int32_t totalCharges, int32_t consecCharges, tpccf::Charge maxCharge);
 };
 
+class GPUTPCCFHIPTailConnector : public GPUKernelTemplate
+{
+ public:
+  enum {
+    MaxHIPTails = 1 << 15,
+    MaxHIPTailsPerRow = MaxHIPTails,
+  };
+
+  struct GPUSharedMemory {
+  };
+
+  typedef GPUTPCClusterFinder processorType;
+  GPUhdi() static processorType* Processor(GPUConstantMem& processors)
+  {
+    return processors.tpcClusterer;
+  }
+
+  GPUhdi() constexpr static gpudatatypes::RecoStep GetRecoStep()
+  {
+    return gpudatatypes::RecoStep::TPCClusterFinding;
+  }
+
+  template <int32_t iKernel = defaultKernel>
+  GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer);
+};
+
+class GPUTPCCFHIPClusterizer : public GPUKernelTemplate
+{
+ public:
+  enum {
+    MaxHIPTails = GPUTPCCFHIPTailConnector::MaxHIPTails,
+    MaxHIPTailsPerRow = GPUTPCCFHIPTailConnector::MaxHIPTailsPerRow,
+  };
+
+  struct GPUSharedMemory {
+  };
+
+  typedef GPUTPCClusterFinder processorType;
+  GPUhdi() static processorType* Processor(GPUConstantMem& processors)
+  {
+    return processors.tpcClusterer;
+  }
+
+  GPUhdi() constexpr static gpudatatypes::RecoStep GetRecoStep()
+  {
+    return gpudatatypes::RecoStep::TPCClusterFinding;
+  }
+
+  template <int32_t iKernel = defaultKernel>
+  GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer);
+};
+
 } // namespace o2::gpu
 
 #endif
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx
index 7fef277138632..3d1ebbd54490e 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx
@@ -477,7 +477,10 @@ GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, floa
   CfChargePos pos(padAndRow.getRow(), padAndRow.getPad(), localTime);
   positions[positionOffset] = pos;
 
-  charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad());
+  // Only apply gain correction if ADC not fully saturated
+  if (charge < 1023.f) {
+    charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad());
+  }
 
   chargeMap[pos] = PackedCharge(charge);
 }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h
index 3ad463f469cd6..74b76f6bf7598 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h
@@ -132,7 +132,7 @@ class GPUTPCCFDecodeZSLink : public GPUTPCCFDecodeZSLinkBase
 {
  public:
   // constants for decoding
-  static inline constexpr int32_t DECODE_BITS = o2::tpc::TPCZSHDRV2::TPC_ZS_NBITS_V34;
+  static inline constexpr int32_t DECODE_BITS = tpc::TPCZSHDRV2::TPC_ZS_NBITS_V34;
   static inline constexpr float DECODE_BITS_FACTOR = 1.f / (1 << (DECODE_BITS - 10));
   static inline constexpr uint32_t DECODE_MASK = (1 << DECODE_BITS) - 1;
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx
index e34163d3803fe..67be936ab4627 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx
@@ -25,6 +25,7 @@
 
 #include "CfChargePos.h"
 #include "CfArray2D.h"
+#include "GPUTPCCFCheckPadBaseline.h"
 
 using namespace o2::gpu;
 using namespace o2::tpc;
@@ -95,6 +96,10 @@ void* GPUTPCClusterFinder::SetPointersScratch(void* mem)
   if ((mRec->GetRecoStepsGPU() & gpudatatypes::RecoStep::TPCClusterFinding)) {
     computePointerWithAlignment(mem, mPscanBuf, mBufSize * mNBufs);
   }
+  // TODO: Use memory scalers for MaxHIPTails.
+  // NOTE: Always allocate since Param() is not available during size computation.
+  computePointerWithAlignment(mem, mPhipTailsByRow, GPUTPCGeometry::NROWS * GPUTPCCFHIPClusterizer::MaxHIPTailsPerRow);
+  computePointerWithAlignment(mem, mPnHIPTails, GPUTPCGeometry::NROWS);
   return mem;
 }
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
index 4d036c2056cc5..bc49d225133fa 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
@@ -46,6 +46,7 @@ namespace o2::gpu
 {
 struct GPUTPCClusterMCInterimArray;
 struct TPCPadGainCalib;
+struct HIPTailDescriptor;
 
 struct CfChargePos;
 
@@ -113,6 +114,8 @@ class GPUTPCClusterFinder : public GPUProcessor
   tpc::ClusterNative* mPclusterByRow = nullptr;
   GPUTPCClusterMCInterimArray* mPlabelsByRow = nullptr;
   int32_t* mPscanBuf = nullptr;
+  HIPTailDescriptor* mPhipTailsByRow = nullptr;
+  uint32_t* mPnHIPTails = nullptr; // one counter per row
   Memory* mPmemory = nullptr;
 
   GPUdi() int32_t* GetScanBuffer(int32_t iBuf) const { return mPscanBuf + iBuf * mBufSize; }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx
index 1e5030956df01..09440825681f4 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx
@@ -15,7 +15,6 @@
 #include "GPUTPCClusterFinder.h"
 #include "GPUReconstruction.h"
 #include "CfArray2D.h"
-#include "DataFormatsTPC/Digit.h"
 #include "DataFormatsTPC/ClusterNative.h"
 #include "GPUSettings.h"
 
@@ -155,7 +154,7 @@ void GPUTPCClusterFinder::DumpSuppressedPeaksCompacted(std::ostream& out)
 
 void GPUTPCClusterFinder::DumpClusters(std::ostream& out)
 {
-  out << "\nClusterer - Clusters - Sector " << mISector << " - Fragment " << mPmemory->fragment.index << "\n";
+  out << "\nClusterer - Clusters - Sector " << mISector << " - All Fragments\n";
 
   for (uint32_t i = 0; i < GPUTPCGeometry::NROWS; i++) {
     size_t N = mPclusterInRow[i];
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index 2176ea2dc3804..3041c2b869de2 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -103,6 +103,8 @@ o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, sortPerSectorRow"      "GPUTP
 o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, countFilteredClusters" "GPUTPCDecompressionKernels"                            LB)
 o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, storeFilteredClusters" "GPUTPCDecompressionKernels"                            LB)
 o2_gpu_add_kernel("GPUTPCCFCheckPadBaseline"                              "= TPCCLUSTERFINDER"                                    LB)
+o2_gpu_add_kernel("GPUTPCCFHIPTailConnector"                              "GPUTPCCFCheckPadBaseline TPCCLUSTERFINDER"             LB)
+o2_gpu_add_kernel("GPUTPCCFHIPClusterizer"                                "GPUTPCCFCheckPadBaseline TPCCLUSTERFINDER"             LB)
 o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillIndexMap"                 "= TPCCLUSTERFINDER"                                    LB)
 o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillFromDigits"               "= TPCCLUSTERFINDER"                                    LB)
 o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, findFragmentStart"            "= TPCCLUSTERFINDER"                                    LB int8_t setPositions)
diff --git a/GPU/GPUTracking/utils/VcShim.h b/GPU/GPUTracking/utils/VcShim.h
index 21a9a6a5c95c2..2bbc1d471bbbb 100644
--- a/GPU/GPUTracking/utils/VcShim.h
+++ b/GPU/GPUTracking/utils/VcShim.h
@@ -19,7 +19,7 @@
 
 #ifndef GPUCA_NO_VC
 
-#include <Vc/Vc>
+#include <Vc/Vc> // IWYU pragma: export
 
 #else
 
diff --git a/log.txt b/log.txt
deleted file mode 100644
index e69de29bb2d1d..0000000000000