Skip to content

Commit b83ba60

Browse files
committed
Classify PITR slicer errors as fatal vs retriable to stop infinite restart loop
When a slicer fails with a permanent error (e.g. insufficient oplog range, missing base backup), PITR now stops cluster-wide and stays stopped until a successful backup clears the error state. Previously, leadNomination unconditionally called InitMeta, which cleared the error, causing an infinite fail-stop-restart loop in which healthy replica sets accumulated useless oplog chunks.
1 parent 068bfb3 commit b83ba60

File tree

3 files changed

+72
-14
lines changed

3 files changed

+72
-14
lines changed

cmd/pbm-agent/backup.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"github.com/percona/percona-backup-mongodb/pbm/errors"
1212
"github.com/percona/percona-backup-mongodb/pbm/lock"
1313
"github.com/percona/percona-backup-mongodb/pbm/log"
14+
"github.com/percona/percona-backup-mongodb/pbm/oplog"
1415
"github.com/percona/percona-backup-mongodb/pbm/prio"
1516
"github.com/percona/percona-backup-mongodb/pbm/storage"
1617
"github.com/percona/percona-backup-mongodb/pbm/topo"
@@ -258,6 +259,19 @@ func (a *Agent) Backup(ctx context.Context, cmd *ctrl.BackupCmd, opid ctrl.OPID,
258259
}
259260
} else {
260261
l.Info("backup finished")
262+
263+
// A successful backup creates a new PITR starting point.
264+
// Clear any fatal PITR error so the supervisor can restart slicing.
265+
if nodeInfo.IsLeader() {
266+
status, serr := oplog.GetClusterStatus(ctx, a.leadConn)
267+
if serr == nil && status == oplog.StatusError {
268+
if ierr := oplog.SetClusterStatus(ctx, a.leadConn, oplog.StatusUnset); ierr != nil {
269+
l.Warning("clear PITR error status: %v", ierr)
270+
} else {
271+
l.Info("PITR error state cleared after successful backup")
272+
}
273+
}
274+
}
261275
}
262276
}
263277

cmd/pbm-agent/pitr.go

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,14 @@ func (a *Agent) pitr(ctx context.Context) error {
245245
return nil
246246
}
247247

248+
// If PITR is in a fatal error state (e.g. no base backup), skip
249+
// nomination and wait cycles. A new backup is needed to clear this.
250+
cStatus, serr := oplog.GetClusterStatus(ctx, a.leadConn)
251+
if serr == nil && cStatus == oplog.StatusError {
252+
l.Info("pitr is in error state, new backup is required to resume")
253+
return nil
254+
}
255+
248256
if nodeInfo.IsClusterLeader() {
249257
// start monitor jobs on cluster leader
250258
a.startMon(ctx, cfg)
@@ -284,8 +292,12 @@ func (a *Agent) pitr(ctx context.Context) error {
284292
}
285293

286294
defer func() {
287-
if err != nil {
288-
l.Debug("setting RS error status for err: %v", err)
295+
if err == nil {
296+
return
297+
}
298+
l.Debug("setting RS error status for err: %v", err)
299+
var fatalErr slicer.FatalSlicerError
300+
if errors.As(err, &fatalErr) {
289301
if err := oplog.SetErrorRSStatus(ctx, a.leadConn, nodeInfo.SetName, nodeInfo.Me, err.Error()); err != nil {
290302
l.Error("error while setting error status: %v", err)
291303
}
@@ -379,10 +391,19 @@ func (a *Agent) pitr(ctx context.Context) error {
379391
monitorPrio,
380392
)
381393
if streamErr != nil {
382-
l.Error("streaming oplog: %v", streamErr)
383-
retErr := errors.Wrap(streamErr, "streaming oplog")
384-
if err := oplog.SetErrorRSStatus(ctx, a.leadConn, nodeInfo.SetName, nodeInfo.Me, retErr.Error()); err != nil {
385-
l.Error("setting RS status to StatusError: %v", err)
394+
var movedErr slicer.OpMovedError
395+
var fatalErr slicer.FatalSlicerError
396+
397+
switch {
398+
case errors.As(streamErr, &movedErr):
399+
l.Info("streaming stopped: %v", streamErr)
400+
case errors.As(streamErr, &fatalErr):
401+
l.Error("streaming oplog: %v", streamErr)
402+
if err := oplog.SetErrorRSStatus(ctx, a.leadConn, nodeInfo.SetName, nodeInfo.Me, streamErr.Error()); err != nil {
403+
l.Error("setting RS status to StatusError: %v", err)
404+
}
405+
default:
406+
l.Error("streaming oplog: %v", streamErr)
386407
}
387408
}
388409

@@ -402,6 +423,16 @@ func (a *Agent) leadNomination(
402423
) {
403424
l := log.LogEventFromContext(ctx)
404425

426+
status, err := oplog.GetClusterStatus(ctx, a.leadConn)
427+
if err != nil && !errors.Is(err, errors.ErrNotFound) {
428+
l.Error("get cluster status: %v", err)
429+
return
430+
}
431+
if status == oplog.StatusError {
432+
l.Info("pitr is in error state, new backup is required to resume")
433+
return
434+
}
435+
405436
l.Debug("checking locks in the whole cluster")
406437
noLocks, err := a.waitAllOpLockRelease(ctx)
407438
if err != nil {
@@ -884,11 +915,14 @@ func (a *Agent) pitrErrorMonitor(ctx context.Context) {
884915
continue
885916
}
886917

887-
l.Debug("error while executing pitr, pitr procedure will be restarted")
918+
l.Debug("error while executing pitr, pitr procedure will be stopped")
888919
err = oplog.SetClusterStatus(ctx, a.leadConn, oplog.StatusError)
889920
if err != nil {
890921
l.Error("error while setting cluster status Error: %v", err)
891922
}
923+
a.removePitr()
924+
a.stopMon()
925+
return
892926

893927
case <-ctx.Done():
894928
return

pbm/slicer/slicer.go

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ func (s *Slicer) Catchup(ctx context.Context) error {
8484
lastBackup, err := backup.GetLastBackup(ctx, s.leadClient, nil)
8585
if err != nil {
8686
if errors.Is(err, errors.ErrNotFound) {
87-
err = errors.New("no backup found. full backup is required to start PITR")
87+
return FatalSlicerError{errors.New("no backup found. full backup is required to start PITR")}
8888
}
8989
return errors.Wrap(err, "get last backup")
9090
}
@@ -98,19 +98,19 @@ func (s *Slicer) Catchup(ctx context.Context) error {
9898
}
9999
}
100100
if rs == nil {
101-
return errors.Errorf("no replset %q in the last backup %q. "+
101+
return FatalSlicerError{errors.Errorf("no replset %q in the last backup %q. "+
102102
"full backup is required to start PITR",
103-
s.rs, lastBackup.Name)
103+
s.rs, lastBackup.Name)}
104104
}
105105

106106
lastRestore, err := restore.GetLastRestore(ctx, s.leadClient)
107107
if err != nil && !errors.Is(err, errors.ErrNotFound) {
108108
return errors.Wrap(err, "get last restore")
109109
}
110110
if lastRestore != nil && lastBackup.StartTS < lastRestore.StartTS {
111-
return errors.Errorf("no backup found after the restored %s, "+
111+
return FatalSlicerError{errors.Errorf("no backup found after the restored %s, "+
112112
"a new backup is required to resume PITR",
113-
lastRestore.Backup)
113+
lastRestore.Backup)}
114114
}
115115

116116
lastChunk, err := oplog.PITRLastChunkMeta(ctx, s.leadClient, s.rs)
@@ -220,7 +220,7 @@ func (s *Slicer) OplogOnlyCatchup(ctx context.Context) error {
220220
return errors.Wrapf(err, "check oplog sufficiency for %v", lastChunk)
221221
}
222222
if !ok {
223-
return oplog.InsuffRangeError{lastChunk.EndTS}
223+
return FatalSlicerError{oplog.InsuffRangeError{lastChunk.EndTS}}
224224
}
225225

226226
s.lastTS = lastChunk.EndTS
@@ -291,6 +291,16 @@ func (e OpMovedError) Is(err error) bool {
291291
return ok
292292
}
293293

294+
// FatalSlicerError wraps errors that are permanent and cannot be resolved
295+
// by retrying. PITR should not restart until the underlying issue is fixed
296+
// (e.g. a new backup is taken).
297+
type FatalSlicerError struct {
298+
Err error
299+
}
300+
301+
func (e FatalSlicerError) Error() string { return e.Err.Error() }
302+
func (e FatalSlicerError) Unwrap() error { return e.Err }
303+
294304
// LogStartMsg message to log on successful streaming start
295305
const LogStartMsg = "start_ok"
296306

@@ -321,7 +331,7 @@ func (s *Slicer) Stream(
321331
return errors.Wrap(err, "check oplog sufficiency")
322332
}
323333
if !ok {
324-
return oplog.InsuffRangeError{s.lastTS}
334+
return FatalSlicerError{oplog.InsuffRangeError{s.lastTS}}
325335
}
326336
s.l.Debug(LogStartMsg)
327337

0 commit comments

Comments
 (0)