diff --git a/utils/nvme/nvme.go b/utils/nvme/nvme.go
index 898321f1a..956b1cf7e 100644
--- a/utils/nvme/nvme.go
+++ b/utils/nvme/nvme.go
@@ -300,6 +300,17 @@ func (nh *NVMeHandler) AttachNVMeVolume(
 
 	nvmeDev, err := nvmeSubsys.GetNVMeDevice(ctx, publishInfo.NVMeNamespaceUUID)
 	if err != nil {
+		// Subsystem is connected but the namespace device is absent, likely a missed namespace
+		// notification. Trigger a rescan so a subsequent attach retry can find the device.
+		if connectionStatus == NVMeSubsystemConnected && errors.IsNotFoundError(err) {
+			if rescanErr := nvmeSubsys.RescanNamespaces(ctx); rescanErr != nil {
+				Logc(ctx).WithError(rescanErr).Warning(
+					"Failed to rescan NVMe namespaces after namespace device was not found.")
+			} else {
+				Logc(ctx).WithField("namespace", publishInfo.NVMeNamespaceUUID).Debug(
+					"Triggered NVMe namespace rescan after namespace device was not found.")
+			}
+		}
 		return err
 	}
 	devPath := nvmeDev.GetPath()
@@ -679,6 +690,14 @@ func (nh *NVMeHandler) InspectNVMeSessions(
 			pubSessionData.SetRemediation(ConnectOp)
 			subsToFix = append(subsToFix, currSessionData.Subsystem)
 			continue
+		case NVMeSubsystemConnected:
+			// All paths are up but a published namespace may still be missing its device after a
+			// missed namespace notification. Schedule a rescan to recover it.
+			if nh.subsystemHasMissingNamespace(ctx, &currSessionData.Subsystem, pubSessionData) {
+				pubSessionData.SetRemediation(RescanOp)
+				subsToFix = append(subsToFix, currSessionData.Subsystem)
+				continue
+			}
 		}
 
 		// All/None of the paths are present for the subsystem
@@ -689,6 +708,39 @@ func (nh *NVMeHandler) InspectNVMeSessions(
 	return subsToFix
 }
 
+// subsystemHasMissingNamespace reports whether any namespace published on the subsystem is missing
+// its device on the host, a condition a namespace rescan can recover.
+func (nh *NVMeHandler) subsystemHasMissingNamespace(
+	ctx context.Context, sub *NVMeSubsystem, sessionData *NVMeSessionData,
+) bool {
+	if sessionData == nil {
+		return false
+	}
+
+	if present, err := afero.DirExists(sub.osFs, sub.Name); err != nil || !present {
+		return false
+	}
+
+	for nsUUID := range sessionData.Namespaces {
+		_, err := sub.GetNVMeDeviceAt(ctx, nsUUID)
+		if err == nil {
+			continue
+		}
+		if errors.IsNotFoundError(err) {
+			Logc(ctx).WithFields(LogFields{
+				"subsystem": sub.NQN,
+				"namespace": nsUUID,
+			}).Warning("Published NVMe namespace has no device on host; scheduling namespace rescan.")
+			return true
+		}
+
+		Logc(ctx).WithError(err).WithField("namespace", nsUUID).Debug(
+			"Error while checking NVMe namespace device presence during self-healing.")
+	}
+
+	return false
+}
+
 // RectifyNVMeSession applies the required remediation on the subsystemToFix to make it working again.
 func (nh *NVMeHandler) RectifyNVMeSession(
 	ctx context.Context, subsystemToFix NVMeSubsystem, pubSessions *NVMeSessions,
@@ -705,12 +757,20 @@ func (nh *NVMeHandler) RectifyNVMeSession(
 	// Updating the access time as we are trying to do some NVMeOperation on this subsystem.
 	pubSessionData.LastAccessTime = time.Now()
 
-	if pubSessionData.Remediation == ConnectOp {
+	switch pubSessionData.Remediation {
+	case ConnectOp:
 		if err := subsystemToFix.Connect(ctx, pubSessionData.NVMeTargetIPs, true); err != nil {
 			Logc(ctx).Errorf("NVMe Self healing failed for subsystem %s; %v", subsystemToFix.NQN, err)
 		} else {
 			Logc(ctx).Infof("NVMe Self healing succeeded for %s", subsystemToFix.NQN)
 		}
+	case RescanOp:
+		if err := subsystemToFix.RescanNamespaces(ctx); err != nil {
+			Logc(ctx).Errorf("NVMe Self healing (namespace rescan) failed for subsystem %s; %v",
+				subsystemToFix.NQN, err)
+		} else {
+			Logc(ctx).Infof("NVMe Self healing (namespace rescan) succeeded for %s", subsystemToFix.NQN)
+		}
 	}
 }
 
diff --git a/utils/nvme/nvme_darwin.go b/utils/nvme/nvme_darwin.go
index 232bbbb44..9497786da 100644
--- a/utils/nvme/nvme_darwin.go
+++ b/utils/nvme/nvme_darwin.go
@@ -46,6 +46,13 @@ func (s *NVMeSubsystem) DisconnectSubsystemFromHost(ctx context.Context) error {
 	return errors.UnsupportedError("DisconnectSubsystemFromHost is not supported for darwin")
 }
 
+// RescanNamespaces re-enumerates the namespaces on the subsystem's controllers.
+func (s *NVMeSubsystem) RescanNamespaces(ctx context.Context) error {
+	Logc(ctx).Debug(">>>> nvme_darwin.RescanNamespaces")
+	defer Logc(ctx).Debug("<<<< nvme_darwin.RescanNamespaces")
+	return errors.UnsupportedError("RescanNamespaces is not supported for darwin")
+}
+
 func (nh *NVMeHandler) GetNVMeSubsystem(ctx context.Context, nqn string) (*NVMeSubsystem,
 	error,
 ) {
diff --git a/utils/nvme/nvme_linux.go b/utils/nvme/nvme_linux.go
index dc7bcae72..99116af16 100644
--- a/utils/nvme/nvme_linux.go
+++ b/utils/nvme/nvme_linux.go
@@ -351,6 +351,37 @@ func (s *NVMeSubsystem) GetNVMeDeviceAt(ctx context.Context, nsUUID string) (*NV
 	return nil, errors.NotFoundError("no device found for the given namespace %v", nsUUID)
 }
 
+// RescanNamespaces re-enumerates the namespaces on each of the subsystem's controllers using
+// "nvme ns-rescan". It recovers namespaces that are mapped but missing on the host; it only adds
+// namespaces and never renames existing devices, so it is safe on a connected subsystem.
+func (s *NVMeSubsystem) RescanNamespaces(ctx context.Context) error {
+	Logc(ctx).Debug(">>>> nvme_linux.RescanNamespaces")
+	defer Logc(ctx).Debug("<<<< nvme_linux.RescanNamespaces")
+
+	if len(s.Paths) == 0 {
+		return fmt.Errorf("no paths present for subsystem %s; cannot rescan namespaces", s.NQN)
+	}
+
+	failed := 0
+	for _, path := range s.Paths {
+		// path.Name is the controller's sysfs path, e.g. .../nvme0; the device is /dev/nvme0.
+		controller := "/dev/" + path.Name[strings.LastIndex(path.Name, "/")+1:]
+		if _, err := s.command.Execute(ctx, "nvme", "ns-rescan", controller); err != nil {
+			Logc(ctx).WithError(err).Errorf("Failed to rescan namespaces on controller %s.", controller)
+			failed++
+			continue
+		}
+		Logc(ctx).WithField("controller", controller).Debug("Rescanned NVMe namespaces on controller.")
+	}
+
+	// Succeed if at least one controller path was rescanned successfully.
+	if failed == len(s.Paths) {
+		return fmt.Errorf("failed to rescan namespaces on all paths for subsystem %s", s.NQN)
+	}
+
+	return nil
+}
+
 // FlushNVMeDevice flushes any ongoing IOs present on the NVMe device.
 func (d *NVMeDevice) FlushNVMeDevice(ctx context.Context) error {
 	Logc(ctx).Debug(">>>> nvme_linux.FlushNVMeDevice")
diff --git a/utils/nvme/nvme_types.go b/utils/nvme/nvme_types.go
index c19463101..e38860138 100644
--- a/utils/nvme/nvme_types.go
+++ b/utils/nvme/nvme_types.go
@@ -113,6 +113,7 @@ type NVMeOperation int8
 const (
 	NoOp NVMeOperation = iota
 	ConnectOp
+	RescanOp
 )
 
 // NVMeSessionData contains all the information related to any NVMe session. It has the subsystem information, the
@@ -136,6 +137,7 @@ type NVMeSessions struct {
 type NVMeSubsystemInterface interface {
 	GetConnectionStatus() NVMeSubsystemConnectionStatus
 	Connect(ctx context.Context, nvmeTargetIps []string, connectOnly bool) error
+	RescanNamespaces(ctx context.Context) error
 	Disconnect(ctx context.Context) error
 	GetNamespaceCount(ctx context.Context) (int, error)
 	IsNetworkPathPresent(ip string) bool
diff --git a/utils/nvme/nvme_windows.go b/utils/nvme/nvme_windows.go
index d18e6232a..016e27349 100644
--- a/utils/nvme/nvme_windows.go
+++ b/utils/nvme/nvme_windows.go
@@ -46,6 +46,13 @@ func (s *NVMeSubsystem) DisconnectSubsystemFromHost(ctx context.Context) error {
 	return errors.UnsupportedError("DisconnectSubsystemFromHost is not supported for windows")
 }
 
+// RescanNamespaces re-enumerates the namespaces on the subsystem's controllers.
+func (s *NVMeSubsystem) RescanNamespaces(ctx context.Context) error {
+	Logc(ctx).Debug(">>>> nvme_windows.RescanNamespaces")
+	defer Logc(ctx).Debug("<<<< nvme_windows.RescanNamespaces")
+	return errors.UnsupportedError("RescanNamespaces is not supported for windows")
+}
+
 func (nh *NVMeHandler) GetNVMeSubsystem(ctx context.Context, nqn string) (*NVMeSubsystem, error) {
 	Logc(ctx).Debug(">>>> nvme_windows.GetNVMeSubsystem")
 	defer Logc(ctx).Debug("<<<< nvme_windows.GetNVMeSubsystem")