From 25a3d55fb5ea8523a33eba28a2066fa9409d77c8 Mon Sep 17 00:00:00 2001 From: Manuel Grandeit Date: Wed, 24 Jun 2026 14:41:01 +0200 Subject: [PATCH] Rescan NVMe namespaces when a published namespace device is missing When a namespace is mapped to an already-connected NVMe subsystem, the host only creates its device node in response to an asynchronous notification from the controller. If that notification is missed, the device never appears and NodeStageVolume keeps failing with "no device found for the given namespace". Recover the namespace by issuing "nvme ns-rescan" on the subsystem's controllers: from the AttachNVMeVolume retry loop when the device is missing on an already-connected subsystem, and from NVMe self-healing when a published namespace has no device on the host. The self-healing case mirrors iSCSI self-healing, which already rescans the host for devices. A rescan only adds missing namespaces and never renames existing devices, so it is safe on a connected subsystem. Signed-off-by: Manuel Grandeit --- utils/nvme/nvme.go | 62 +++++++++++++++++++++++++++++++++++++- utils/nvme/nvme_darwin.go | 7 +++++ utils/nvme/nvme_linux.go | 31 +++++++++++++++++++ utils/nvme/nvme_types.go | 2 ++ utils/nvme/nvme_windows.go | 7 +++++ 5 files changed, 108 insertions(+), 1 deletion(-) diff --git a/utils/nvme/nvme.go b/utils/nvme/nvme.go index 898321f1a..956b1cf7e 100644 --- a/utils/nvme/nvme.go +++ b/utils/nvme/nvme.go @@ -300,6 +300,17 @@ func (nh *NVMeHandler) AttachNVMeVolume( nvmeDev, err := nvmeSubsys.GetNVMeDevice(ctx, publishInfo.NVMeNamespaceUUID) if err != nil { + // Subsystem is connected but the namespace device is absent, likely a missed namespace + // notification. Trigger a rescan so a subsequent attach retry can find the device. 
+ if connectionStatus == NVMeSubsystemConnected && errors.IsNotFoundError(err) { + if rescanErr := nvmeSubsys.RescanNamespaces(ctx); rescanErr != nil { + Logc(ctx).WithError(rescanErr).Warning( + "Failed to rescan NVMe namespaces after namespace device was not found.") + } else { + Logc(ctx).WithField("namespace", publishInfo.NVMeNamespaceUUID).Debug( + "Triggered NVMe namespace rescan after namespace device was not found.") + } + } return err } devPath := nvmeDev.GetPath() @@ -679,6 +690,14 @@ func (nh *NVMeHandler) InspectNVMeSessions( pubSessionData.SetRemediation(ConnectOp) subsToFix = append(subsToFix, currSessionData.Subsystem) continue + case NVMeSubsystemConnected: + // All paths are up but a published namespace may still be missing its device after a + // missed namespace notification. Schedule a rescan to recover it. + if nh.subsystemHasMissingNamespace(ctx, &currSessionData.Subsystem, pubSessionData) { + pubSessionData.SetRemediation(RescanOp) + subsToFix = append(subsToFix, currSessionData.Subsystem) + continue + } } // All/None of the paths are present for the subsystem @@ -689,6 +708,39 @@ func (nh *NVMeHandler) InspectNVMeSessions( return subsToFix } +// subsystemHasMissingNamespace reports whether any namespace published on the subsystem is missing +// its device on the host, a condition a namespace rescan can recover. 
+func (nh *NVMeHandler) subsystemHasMissingNamespace( + ctx context.Context, sub *NVMeSubsystem, sessionData *NVMeSessionData, +) bool { + if sessionData == nil { + return false + } + + if present, err := afero.DirExists(sub.osFs, sub.Name); err != nil || !present { + return false + } + + for nsUUID := range sessionData.Namespaces { + _, err := sub.GetNVMeDeviceAt(ctx, nsUUID) + if err == nil { + continue + } + if errors.IsNotFoundError(err) { + Logc(ctx).WithFields(LogFields{ + "subsystem": sub.NQN, + "namespace": nsUUID, + }).Warning("Published NVMe namespace has no device on host; scheduling namespace rescan.") + return true + } + + Logc(ctx).WithError(err).WithField("namespace", nsUUID).Debug( + "Error while checking NVMe namespace device presence during self-healing.") + } + + return false +} + // RectifyNVMeSession applies the required remediation on the subsystemToFix to make it working again. func (nh *NVMeHandler) RectifyNVMeSession( ctx context.Context, subsystemToFix NVMeSubsystem, pubSessions *NVMeSessions, @@ -705,12 +757,20 @@ func (nh *NVMeHandler) RectifyNVMeSession( // Updating the access time as we are trying to do some NVMeOperation on this subsystem. 
pubSessionData.LastAccessTime = time.Now() - if pubSessionData.Remediation == ConnectOp { + switch pubSessionData.Remediation { + case ConnectOp: if err := subsystemToFix.Connect(ctx, pubSessionData.NVMeTargetIPs, true); err != nil { Logc(ctx).Errorf("NVMe Self healing failed for subsystem %s; %v", subsystemToFix.NQN, err) } else { Logc(ctx).Infof("NVMe Self healing succeeded for %s", subsystemToFix.NQN) } + case RescanOp: + if err := subsystemToFix.RescanNamespaces(ctx); err != nil { + Logc(ctx).Errorf("NVMe Self healing (namespace rescan) failed for subsystem %s; %v", + subsystemToFix.NQN, err) + } else { + Logc(ctx).Infof("NVMe Self healing (namespace rescan) succeeded for %s", subsystemToFix.NQN) + } } } diff --git a/utils/nvme/nvme_darwin.go b/utils/nvme/nvme_darwin.go index 232bbbb44..9497786da 100644 --- a/utils/nvme/nvme_darwin.go +++ b/utils/nvme/nvme_darwin.go @@ -46,6 +46,13 @@ func (s *NVMeSubsystem) DisconnectSubsystemFromHost(ctx context.Context) error { return errors.UnsupportedError("DisconnectSubsystemFromHost is not supported for darwin") } +// RescanNamespaces is not supported on darwin; it only logs the call and returns an UnsupportedError. +func (s *NVMeSubsystem) RescanNamespaces(ctx context.Context) error { + Logc(ctx).Debug(">>>> nvme_darwin.RescanNamespaces") + defer Logc(ctx).Debug("<<<< nvme_darwin.RescanNamespaces") + return errors.UnsupportedError("RescanNamespaces is not supported for darwin") +} + func (nh *NVMeHandler) GetNVMeSubsystem(ctx context.Context, nqn string) (*NVMeSubsystem, error, ) { diff --git a/utils/nvme/nvme_linux.go b/utils/nvme/nvme_linux.go index dc7bcae72..99116af16 100644 --- a/utils/nvme/nvme_linux.go +++ b/utils/nvme/nvme_linux.go @@ -351,6 +351,37 @@ func (s *NVMeSubsystem) GetNVMeDeviceAt(ctx context.Context, nsUUID string) (*NV return nil, errors.NotFoundError("no device found for the given namespace %v", nsUUID) } +// RescanNamespaces re-enumerates the namespaces on each of the subsystem's controllers using +// "nvme ns-rescan". 
It recovers namespaces that are mapped but missing on the host; it only adds +// namespaces and never renames existing devices, so it is safe on a connected subsystem. +func (s *NVMeSubsystem) RescanNamespaces(ctx context.Context) error { + Logc(ctx).Debug(">>>> nvme_linux.RescanNamespaces") + defer Logc(ctx).Debug("<<<< nvme_linux.RescanNamespaces") + + if len(s.Paths) == 0 { + return fmt.Errorf("no paths present for subsystem %s; cannot rescan namespaces", s.NQN) + } + + failed := 0 + for _, path := range s.Paths { + // path.Name is the controller's sysfs path, e.g. .../nvme0; the device is /dev/nvme0. + controller := "/dev/" + path.Name[strings.LastIndex(path.Name, "/")+1:] + if _, err := s.command.Execute(ctx, "nvme", "ns-rescan", controller); err != nil { + Logc(ctx).WithError(err).Errorf("Failed to rescan namespaces on controller %s.", controller) + failed++ + continue + } + Logc(ctx).WithField("controller", controller).Debug("Rescanned NVMe namespaces on controller.") + } + + // Succeed if at least one controller path was rescanned successfully. + if failed == len(s.Paths) { + return fmt.Errorf("failed to rescan namespaces on all paths for subsystem %s", s.NQN) + } + + return nil +} + // FlushNVMeDevice flushes any ongoing IOs present on the NVMe device. func (d *NVMeDevice) FlushNVMeDevice(ctx context.Context) error { Logc(ctx).Debug(">>>> nvme_linux.FlushNVMeDevice") diff --git a/utils/nvme/nvme_types.go b/utils/nvme/nvme_types.go index c19463101..e38860138 100644 --- a/utils/nvme/nvme_types.go +++ b/utils/nvme/nvme_types.go @@ -113,6 +113,7 @@ type NVMeOperation int8 const ( NoOp NVMeOperation = iota ConnectOp + RescanOp ) // NVMeSessionData contains all the information related to any NVMe session. 
It has the subsystem information, the @@ -136,6 +137,7 @@ type NVMeSessions struct { type NVMeSubsystemInterface interface { GetConnectionStatus() NVMeSubsystemConnectionStatus Connect(ctx context.Context, nvmeTargetIps []string, connectOnly bool) error + RescanNamespaces(ctx context.Context) error Disconnect(ctx context.Context) error GetNamespaceCount(ctx context.Context) (int, error) IsNetworkPathPresent(ip string) bool diff --git a/utils/nvme/nvme_windows.go b/utils/nvme/nvme_windows.go index d18e6232a..016e27349 100644 --- a/utils/nvme/nvme_windows.go +++ b/utils/nvme/nvme_windows.go @@ -46,6 +46,13 @@ func (s *NVMeSubsystem) DisconnectSubsystemFromHost(ctx context.Context) error { return errors.UnsupportedError("DisconnectSubsystemFromHost is not supported for windows") } +// RescanNamespaces is not supported on windows; it only logs the call and returns an UnsupportedError. +func (s *NVMeSubsystem) RescanNamespaces(ctx context.Context) error { + Logc(ctx).Debug(">>>> nvme_windows.RescanNamespaces") + defer Logc(ctx).Debug("<<<< nvme_windows.RescanNamespaces") + return errors.UnsupportedError("RescanNamespaces is not supported for windows") +} + func (nh *NVMeHandler) GetNVMeSubsystem(ctx context.Context, nqn string) (*NVMeSubsystem, error) { Logc(ctx).Debug(">>>> nvme_windows.GetNVMeSubsystem") defer Logc(ctx).Debug("<<<< nvme_windows.GetNVMeSubsystem")