-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Description
What happened?
version 1.1.25
If you delete the application defined by an app.yaml file and then immediately apply another app.yaml, the new app might get stuck in the submitted state.
The initial analysis suggests that the Spark operator hadn't yet deleted the old driver pod, and the new submit operation detected the existence of a driver pod with the same name.
Reproduction Code
// runSparkSubmit runs $SPARK_HOME/bin/spark-submit with the arguments carried by
// the given submission and reports whether the submission went through.
//
// Results:
//   - (true, nil) when the spark-submit command finishes without error.
//   - (false, nil) when the command fails and its stderr contains
//     podAlreadyExistsErrorCode, i.e. the driver pod of the application already
//     exists. Only a warning is logged in that case.
//   - (false, err) for every other failure. The error text carries the captured
//     stderr when there is any, and the underlying command error otherwise.
//
// A missing SPARK_HOME is only logged; the command path is then built from an
// empty home directory and the function carries on.
func runSparkSubmit(submission *submission) (bool, error) {
	home, ok := os.LookupEnv(sparkHomeEnvVar)
	if !ok {
		log.Error("SPARK_HOME is not specified")
	}

	submitCmd := execCommand(filepath.Join(home, "/bin/spark-submit"), submission.args...)
	log.Infof("spark-submit arguments: %v", submitCmd.Args)

	// The output is logged whether or not the command succeeded.
	stdout, runErr := submitCmd.Output()
	log.Infof("spark-submit output: %s", string(stdout))
	if runErr == nil {
		return true, nil
	}

	// Stderr is only available when the failure is an *exec.ExitError.
	var stderrText string
	if exitErr, isExit := runErr.(*exec.ExitError); isExit {
		stderrText = string(exitErr.Stderr)
	}

	switch {
	case strings.Contains(stderrText, podAlreadyExistsErrorCode):
		// The driver pod of the application already exists.
		log.Warnf("trying to resubmit an already submitted SparkApplication %s/%s", submission.namespace, submission.name)
		return false, nil
	case stderrText != "":
		return false, fmt.Errorf("failed to run spark-submit for SparkApplication %s/%s: %s", submission.namespace, submission.name, stderrText)
	default:
		return false, fmt.Errorf("failed to run spark-submit for SparkApplication %s/%s: %v", submission.namespace, submission.name, runErr)
	}
}
// Excerpt from the caller of runSparkSubmit: builds a submission from the
// command arguments and the application, runs it, and handles the result.
submitted, err := runSparkSubmit(newSubmission(submissionCmdArgs, app))
if err != nil {
// On error the whole status is replaced with a failed-submission status that
// carries the error message, an incremented attempt counter and the current time.
app.Status = v1beta2.SparkApplicationStatus{
AppState: v1beta2.ApplicationState{
State: v1beta2.FailedSubmissionState,
ErrorMessage: err.Error(),
},
SubmissionAttempts: app.Status.SubmissionAttempts + 1,
LastSubmissionAttemptTime: metav1.Now(),
}
c.recordSparkApplicationEvent(app)
log.Errorf("failed to run spark-submit for SparkApplication %s/%s: %v", app.Namespace, app.Name, err)
return app
}
if !submitted {
// The application may not have been submitted even if err == nil, e.g., when some
// state update caused an attempt to re-submit the application, in which case no
// error gets returned from runSparkSubmit. If this is the case, we simply return.
// NOTE(review): app is returned here without any status change in this excerpt;
// this is the path taken when runSparkSubmit reports that the driver pod already exists.
return app
}
Expected behavior
No response
Actual behavior
No response
Environment & Versions
- Kubernetes Version:
- Spark Operator Version:
- Apache Spark Version:
Additional context
No response
Impacted by this bug?
Give it a 👍 We prioritize the issues with the most 👍