@@ -34,7 +34,6 @@ const (
3434 criuPath = "/tmp/criu"
3535
3636 // Metadata storage paths
37- cudaCmdFileName = "cuda-cmd"
3837 checkpointSubdirName = "checkpoint"
3938)
4039
@@ -142,20 +141,6 @@ func (c *checkpointer) Checkpoint(ctx context.Context, cogletCmd *exec.Cmd, wait
142141
143142 cudaPID := strings .TrimSpace (string (cudaPIDBytes ))
144143
145- // Get the command for this PID - it is _not_ always the root python process
146- data , err := exec .CommandContext (ctx , "ps" , "-o" , "cmd=" , cudaPID ).Output ()
147- if err != nil {
148- return err
149- }
150-
151- cudaCmd := strings .TrimSpace (string (data ))
152-
153- // Write said command to a file for later
154- err = os .WriteFile (filepath .Join (c .checkpointDir , cudaCmdFileName ), []byte (cudaCmd ), 0o644 )
155- if err != nil {
156- return err
157- }
158-
159144 // Toggle CUDA off
160145 cmd := exec .CommandContext (ctx , cudaCheckpointPath , "--toggle" , "--pid" , cudaPID )
161146 if err := cmd .Run (); err != nil {
@@ -198,35 +183,18 @@ func (c *checkpointer) Restore(ctx context.Context) (*exec.Cmd, func(context.Con
198183 return nil , nil , nil
199184 }
200185
201- // Read process from sentinel file
202- cudaCmd , err := os .ReadFile (filepath .Join (c .checkpointDir , cudaCmdFileName ))
203- if err != nil {
204- return nil , nil , err
205- }
206-
207186 // Set up restore command
208187 restoreCmd := exec .CommandContext (ctx , criuPath , "restore" , "--tcp-close" , "--images-dir" , filepath .Join (c .checkpointDir , checkpointSubdirName ))
209188
210189 // Set up callback function once restore is started
211190 callback := func (con context.Context ) error {
212- // Get the PID for the command
213- cudaPID , err := exec .CommandContext (con , "pgrep" , "-fx" , string (cudaCmd )).Output ()
214- if err != nil {
215- c .log .Errorw ("failed to pgrep the CUDA command" , "error" , err )
216- // If this command failed, we want to best effort try to kill the started process,
217- // since we'll start a new one
218- restoreCmd .Process .Kill () //nolint:errcheck // This is just best effort
219-
220- return err
221- }
222-
223191 // Toggle CUDA on for the restored process
224- cmd := exec .CommandContext (con , cudaCheckpointPath , "--toggle" , "--pid" , string ( cudaPID ))
192+ cmd := exec .CommandContext (con , cudaCheckpointPath , "--toggle" , "--pid" , strconv . Itoa ( restoreCmd . Process . Pid ))
225193 if err := cmd .Run (); err != nil {
226194 c .log .Errorw ("failed to toggle CUDA on" , "error" , err )
227195 // If this command failed, we want to best effort try to kill the started process,
228196 // since we'll start a new one
229- restoreCmd . Process . Kill ( ) //nolint:errcheck // This is just best effort
197+ killProcess ( restoreCmd ) //nolint:errcheck // This is just best effort
230198
231199 return err
232200 }
@@ -238,6 +206,24 @@ func (c *checkpointer) Restore(ctx context.Context) (*exec.Cmd, func(context.Con
238206 return restoreCmd , callback , nil
239207}
240208
// killProcess forcibly terminates the process started by cmd and then waits
// up to five seconds for it to exit.
//
// It returns nil when:
//   - cmd is nil or was never started (there is nothing to kill),
//   - the process exits after being killed, or
//   - the wait times out (the kill is best effort; the process may still be
//     shutting down).
//
// It returns the error from Process.Kill if the kill could not be issued, or
// the error from cmd.Wait when that error is something other than the
// process's exit status.
func killProcess(cmd *exec.Cmd) error {
	// Guard against a command that never started: cmd.Process is only set by
	// a successful Start, and dereferencing it otherwise would panic.
	if cmd == nil || cmd.Process == nil {
		return nil
	}

	if err := cmd.Process.Kill(); err != nil {
		return err
	}

	const waitTimeout = 5 * time.Second

	// Buffered so the goroutine can always deliver its result and finish,
	// even if we stopped listening because the timeout fired first.
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()

	timer := time.NewTimer(waitTimeout)
	defer timer.Stop()

	select {
	case err := <-done:
		// Wait reports a killed process as an *exec.ExitError (for example
		// "signal: killed"). That is the outcome we asked for, not a failure.
		if _, ok := err.(*exec.ExitError); ok {
			return nil
		}
		return err
	case <-timer.C:
		return nil
	}
}
226+
241227func (c * checkpointer ) WriteReadyFile () error {
242228 // If it isn't expected, make this a no-op
243229 if os .Getenv (shouldCheckpointEnvVar ) != "true" {
0 commit comments