Skip to content

Commit 4dd6b07

Browse files
authored
Recognize component-level failure for batch/v1 Jobs (#165)
1 parent 4b37c1b commit 4dd6b07

File tree

2 files changed: +26 -21 lines changed

2 files changed: +26 -21 lines changed

internal/controller/appwrapper/appwrapper_controller.go

Lines changed: 24 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ import (
2222
"strconv"
2323
"time"
2424

25+
batchv1 "k8s.io/api/batch/v1"
2526
v1 "k8s.io/api/core/v1"
2627
apierrors "k8s.io/apimachinery/pkg/api/errors"
2728
"k8s.io/apimachinery/pkg/api/meta"
@@ -515,8 +516,8 @@ func (r *AppWrapperReconciler) getComponentStatus(ctx context.Context, aw *workl
515516

516517
for componentIdx := range aw.Status.ComponentStatus {
517518
cs := &aw.Status.ComponentStatus[componentIdx]
518-
switch cs.Kind {
519-
case "PyTorchJob":
519+
switch cs.APIVersion + ":" + cs.Kind {
520+
case "kubeflow.org/v1:PyTorchJob":
520521
obj := &unstructured.Unstructured{}
521522
obj.SetAPIVersion(cs.APIVersion)
522523
obj.SetKind(cs.Kind)
@@ -547,16 +548,26 @@ func (r *AppWrapperReconciler) getComponentStatus(ctx context.Context, aw *workl
547548
}
548549
}
549550
}
550-
} else {
551-
if apierrors.IsNotFound(err) {
552-
meta.SetStatusCondition(&aw.Status.ComponentStatus[componentIdx].Conditions, metav1.Condition{
553-
Type: string(workloadv1beta2.Unhealthy),
554-
Status: metav1.ConditionTrue,
555-
Reason: "ComponentNotFound",
556-
})
557-
} else {
558-
return nil, err
551+
} else if !apierrors.IsNotFound(err) {
552+
return nil, err
553+
}
554+
555+
case "batch/v1:Job":
556+
obj := &batchv1.Job{}
557+
if err := r.Get(ctx, types.NamespacedName{Name: cs.Name, Namespace: aw.Namespace}, obj); err == nil {
558+
if obj.GetDeletionTimestamp().IsZero() {
559+
summary.deployed += 1
560+
561+
// batch/v1 Jobs are failed when status.Conditions contains an entry with type "Failed" and status "True"
562+
for _, jc := range obj.Status.Conditions {
563+
if jc.Type == batchv1.JobFailed && jc.Status == v1.ConditionTrue {
564+
summary.failed += 1
565+
}
566+
}
559567
}
568+
569+
} else if !apierrors.IsNotFound(err) {
570+
return nil, err
560571
}
561572

562573
default:
@@ -565,16 +576,8 @@ func (r *AppWrapperReconciler) getComponentStatus(ctx context.Context, aw *workl
565576
if obj.GetDeletionTimestamp().IsZero() {
566577
summary.deployed += 1
567578
}
568-
} else {
569-
if apierrors.IsNotFound(err) {
570-
meta.SetStatusCondition(&aw.Status.ComponentStatus[componentIdx].Conditions, metav1.Condition{
571-
Type: string(workloadv1beta2.Unhealthy),
572-
Status: metav1.ConditionTrue,
573-
Reason: "ComponentNotFound",
574-
})
575-
} else {
576-
return nil, err
577-
}
579+
} else if !apierrors.IsNotFound(err) {
580+
return nil, err
578581
}
579582
}
580583
}

samples/wrapped-failing-job.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,8 @@ spec:
1616
metadata:
1717
name: sample-failing-job
1818
spec:
19+
backoffLimit: 1
20+
completions: 1
1921
template:
2022
spec:
2123
restartPolicy: Never

0 commit comments

Comments
 (0)