Skip to content

Commit f85fd2e

Browse files
committed
Remove stuff only needed for non-cog runtime
1 parent a1366c4 commit f85fd2e

File tree

1 file changed

+20
-34
lines changed

1 file changed

+20
-34
lines changed

internal/checkpointer/checkpointer.go

Lines changed: 20 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ const (
3434
criuPath = "/tmp/criu"
3535

3636
// Metadata storage paths
37-
cudaCmdFileName = "cuda-cmd"
3837
checkpointSubdirName = "checkpoint"
3938
)
4039

@@ -142,20 +141,6 @@ func (c *checkpointer) Checkpoint(ctx context.Context, cogletCmd *exec.Cmd, wait
142141

143142
cudaPID := strings.TrimSpace(string(cudaPIDBytes))
144143

145-
// Get the command for this PID - it is _not_ always the root python process
146-
data, err := exec.CommandContext(ctx, "ps", "-o", "cmd=", cudaPID).Output()
147-
if err != nil {
148-
return err
149-
}
150-
151-
cudaCmd := strings.TrimSpace(string(data))
152-
153-
// Write said command to a file for later
154-
err = os.WriteFile(filepath.Join(c.checkpointDir, cudaCmdFileName), []byte(cudaCmd), 0o644)
155-
if err != nil {
156-
return err
157-
}
158-
159144
// Toggle CUDA off
160145
cmd := exec.CommandContext(ctx, cudaCheckpointPath, "--toggle", "--pid", cudaPID)
161146
if err := cmd.Run(); err != nil {
@@ -198,35 +183,18 @@ func (c *checkpointer) Restore(ctx context.Context) (*exec.Cmd, func(context.Con
198183
return nil, nil, nil
199184
}
200185

201-
// Read process from sentinel file
202-
cudaCmd, err := os.ReadFile(filepath.Join(c.checkpointDir, cudaCmdFileName))
203-
if err != nil {
204-
return nil, nil, err
205-
}
206-
207186
// Set up restore command
208187
restoreCmd := exec.CommandContext(ctx, criuPath, "restore", "--tcp-close", "--images-dir", filepath.Join(c.checkpointDir, checkpointSubdirName))
209188

210189
// Set up callback function once restore is started
211190
callback := func(con context.Context) error {
212-
// Get the PID for the command
213-
cudaPID, err := exec.CommandContext(con, "pgrep", "-fx", string(cudaCmd)).Output()
214-
if err != nil {
215-
c.log.Errorw("failed to pgrep the CUDA command", "error", err)
216-
// If this command failed, we want to best effort try to kill the started process,
217-
// since we'll start a new one
218-
restoreCmd.Process.Kill() //nolint:errcheck // This is just best effort
219-
220-
return err
221-
}
222-
223191
// Toggle CUDA on for the restored process
224-
cmd := exec.CommandContext(con, cudaCheckpointPath, "--toggle", "--pid", string(cudaPID))
192+
cmd := exec.CommandContext(con, cudaCheckpointPath, "--toggle", "--pid", strconv.Itoa(restoreCmd.Process.Pid))
225193
if err := cmd.Run(); err != nil {
226194
c.log.Errorw("failed to toggle CUDA on", "error", err)
227195
// If this command failed, we want to best effort try to kill the started process,
228196
// since we'll start a new one
229-
restoreCmd.Process.Kill() //nolint:errcheck // This is just best effort
197+
killProcess(restoreCmd) //nolint:errcheck // This is just best effort
230198

231199
return err
232200
}
@@ -238,6 +206,24 @@ func (c *checkpointer) Restore(ctx context.Context) (*exec.Cmd, func(context.Con
238206
return restoreCmd, callback, nil
239207
}
240208

209+
// killProcess forcibly terminates the process started by cmd and then waits,
// for at most five seconds, for it to exit so that it gets reaped.
//
// It returns os.ErrInvalid when cmd is nil or was never started (cmd.Process
// is nil), the error from Process.Kill when the kill cannot be delivered, and
// any error from cmd.Wait other than the *exec.ExitError that merely reports
// the unsuccessful exit of the killed process. If the process has not exited
// when the timeout elapses, nil is returned and the wait is left to finish in
// the background.
func killProcess(cmd *exec.Cmd) error {
	// Guard against a command that was never started; dereferencing a nil
	// Process would panic in what is meant to be a best-effort cleanup path.
	if cmd == nil || cmd.Process == nil {
		return os.ErrInvalid
	}

	if err := cmd.Process.Kill(); err != nil {
		return err
	}

	// Buffered so the waiting goroutine can always deliver its result and
	// exit, even if nobody is receiving any more after the timeout.
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()

	// Wait for the process to exit with a 5 second timeout
	timeout := time.NewTimer(5 * time.Second)
	defer timeout.Stop()

	select {
	case err := <-done:
		// Wait reports an unsuccessful exit as *exec.ExitError. After a
		// successful Kill that is the expected outcome, not a failure.
		// Wait returns the value unwrapped, so a type assertion suffices.
		if _, ok := err.(*exec.ExitError); ok {
			return nil
		}
		return err
	case <-timeout.C:
		return nil
	}
}
226+
241227
func (c *checkpointer) WriteReadyFile() error {
242228
// If it isn't expected, make this a no-op
243229
if os.Getenv(shouldCheckpointEnvVar) != "true" {

0 commit comments

Comments
 (0)