@@ -26,6 +26,7 @@ import (
26
26
apierrors "k8s.io/apimachinery/pkg/api/errors"
27
27
"k8s.io/apimachinery/pkg/api/meta"
28
28
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29
+ "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
29
30
"k8s.io/apimachinery/pkg/runtime"
30
31
"k8s.io/apimachinery/pkg/types"
31
32
@@ -66,6 +67,7 @@ type podStatusSummary struct {
66
67
// componentStatusSummary holds counters describing the state of an
// AppWrapper's components. In the visible code the deployed and failed
// counters are incremented while iterating over aw.Status.ComponentStatus.
type componentStatusSummary struct {
67
68
// NOTE(review): presumably the number of components the AppWrapper is
// expected to have; it is not assigned in the visible hunks — confirm.
expected int32
68
69
// deployed is incremented for each component whose object was fetched
// without error and has a zero deletion timestamp.
deployed int32
70
// failed is incremented for each condition entry with type "Failed" and
// status "True" found on a PyTorchJob component. Reconcile treats any
// value greater than zero as a failure that will not self-correct.
+ failed int32
69
71
}
70
72
71
73
// permission to fully control appwrappers
@@ -240,6 +242,18 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
240
242
return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperFailed )
241
243
}
242
244
245
+ // If a component's controller has put it into a failed state, we do not need
246
+ // to allow any further grace period. The situation will not self-correct.
247
+ if compStatus .failed > 0 {
248
+ meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
249
+ Type : string (workloadv1beta2 .Unhealthy ),
250
+ Status : metav1 .ConditionTrue ,
251
+ Reason : "FailedComponent" ,
252
+ Message : fmt .Sprintf ("Found %v failed components" , compStatus .failed ),
253
+ })
254
+ return r .resetOrFail (ctx , aw )
255
+ }
256
+
243
257
// Second, check the Pod-level status of the workload
244
258
podStatus , err := r .getPodStatus (ctx , aw )
245
259
if err != nil {
@@ -501,20 +515,66 @@ func (r *AppWrapperReconciler) getComponentStatus(ctx context.Context, aw *workl
501
515
502
516
for componentIdx := range aw .Status .ComponentStatus {
503
517
cs := & aw .Status .ComponentStatus [componentIdx ]
504
- obj := & metav1.PartialObjectMetadata {TypeMeta : metav1.TypeMeta {Kind : cs .Kind , APIVersion : cs .APIVersion }}
505
- if err := r .Get (ctx , types.NamespacedName {Name : cs .Name , Namespace : aw .Namespace }, obj ); err == nil {
506
- if obj .DeletionTimestamp .IsZero () {
507
- summary .deployed += 1
518
+ switch cs .Kind {
519
+ case "PyTorchJob" :
520
+ obj := & unstructured.Unstructured {}
521
+ obj .SetAPIVersion (cs .APIVersion )
522
+ obj .SetKind (cs .Kind )
523
+ if err := r .Get (ctx , types.NamespacedName {Name : cs .Name , Namespace : aw .Namespace }, obj ); err == nil {
524
+ if obj .GetDeletionTimestamp ().IsZero () {
525
+ summary .deployed += 1
526
+
527
+ // PyTorchJob is failed if status.Conditions contains an entry with type "Failed" and status "True"
528
+ status , ok := obj .UnstructuredContent ()["status" ]
529
+ if ! ok {
530
+ continue
531
+ }
532
+ cond , ok := status .(map [string ]interface {})["conditions" ]
533
+ if ! ok {
534
+ continue
535
+ }
536
+ condArray , ok := cond .([]interface {})
537
+ if ! ok {
538
+ continue
539
+ }
540
+ for _ , aCond := range condArray {
541
+ if condMap , ok := aCond .(map [string ]interface {}); ok {
542
+ if condType , ok := condMap ["type" ]; ok && condType .(string ) == "Failed" {
543
+ if status , ok := condMap ["status" ]; ok && status .(string ) == "True" {
544
+ summary .failed += 1
545
+ }
546
+ }
547
+ }
548
+ }
549
+ }
550
+ } else {
551
+ if apierrors .IsNotFound (err ) {
552
+ meta .SetStatusCondition (& aw .Status .ComponentStatus [componentIdx ].Conditions , metav1.Condition {
553
+ Type : string (workloadv1beta2 .Unhealthy ),
554
+ Status : metav1 .ConditionTrue ,
555
+ Reason : "ComponentNotFound" ,
556
+ })
557
+ } else {
558
+ return nil , err
559
+ }
508
560
}
509
- } else {
510
- if apierrors . IsNotFound ( err ) {
511
- meta . SetStatusCondition ( & aw . Status . ComponentStatus [ componentIdx ]. Conditions , metav1. Condition {
512
- Type : string ( workloadv1beta2 . Unhealthy ),
513
- Status : metav1 . ConditionTrue ,
514
- Reason : "ComponentNotFound" ,
515
- })
561
+
562
+ default :
563
+ obj := & metav1. PartialObjectMetadata { TypeMeta : metav1. TypeMeta { Kind : cs . Kind , APIVersion : cs . APIVersion }}
564
+ if err := r . Get ( ctx , types. NamespacedName { Name : cs . Name , Namespace : aw . Namespace }, obj ); err == nil {
565
+ if obj . GetDeletionTimestamp (). IsZero () {
566
+ summary . deployed += 1
567
+ }
516
568
} else {
517
- return nil , err
569
+ if apierrors .IsNotFound (err ) {
570
+ meta .SetStatusCondition (& aw .Status .ComponentStatus [componentIdx ].Conditions , metav1.Condition {
571
+ Type : string (workloadv1beta2 .Unhealthy ),
572
+ Status : metav1 .ConditionTrue ,
573
+ Reason : "ComponentNotFound" ,
574
+ })
575
+ } else {
576
+ return nil , err
577
+ }
518
578
}
519
579
}
520
580
}
0 commit comments