Skip to content

Commit 3d50830

Browse files
authored
Revert "Updates to the controller logic to better handle failures in etc updates (#424)"
This reverts commit 089cf9f.
1 parent 089cf9f commit 3d50830

File tree

20 files changed

+955
-1470
lines changed

20 files changed

+955
-1470
lines changed

config/crd/bases/mcad.ibm.com_appwrappers.yaml

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
2+
---
13
apiVersion: apiextensions.k8s.io/v1
24
kind: CustomResourceDefinition
35
metadata:
@@ -776,10 +778,6 @@ spec:
776778
QueueJob (by Informer)
777779
format: date-time
778780
type: string
779-
controllerfirstdispatchtimestamp:
780-
description: Microsecond level timestamp when controller first dispatches appwrapper
781-
format: date-time
782-
type: string
783781
failed:
784782
description: The number of resources which reached phase Failed.
785783
format: int32
@@ -792,7 +790,8 @@ spec:
792790
description: Is Dispatched?
793791
type: boolean
794792
local:
795-
description: Indicate if message is a duplicate (for Informer to recognize duplicate messages)
793+
description: Indicate if message is a duplicate (for Informer to recognize
794+
duplicate messages)
796795
type: boolean
797796
message:
798797
type: string
@@ -813,13 +812,15 @@ spec:
813812
format: int32
814813
type: integer
815814
queuejobstate:
816-
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining ...
815+
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining,
816+
...
817817
type: string
818818
running:
819819
format: int32
820820
type: integer
821821
sender:
822-
description: Indicate sender of this message (extremely useful for debugging)
822+
description: Indicate sender of this message (extremely useful for
823+
debugging)
823824
type: string
824825
state:
825826
description: State - Pending, Running, Failed, Deleted
@@ -833,22 +834,9 @@ spec:
833834
(is this different from the MinAvailable from JobStatus)
834835
format: int32
835836
type: integer
836-
numberOfRequeueings:
837-
description: Field to keep track of how many times a requeuing event has been triggered
838-
format: int32
839-
type: integer
840-
default: 0
841-
requeueingTimeInSeconds:
842-
description: Field to keep track of total number of seconds spent in requeueing
843-
format: int32
844-
type: integer
845-
default: 0
846837
type: object
847838
required:
848839
- spec
849840
type: object
850841
served: true
851842
storage: true
852-
subresources:
853-
status: {}
854-

deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
2+
---
13
apiVersion: apiextensions.k8s.io/v1
24
kind: CustomResourceDefinition
35
metadata:
@@ -776,10 +778,6 @@ spec:
776778
QueueJob (by Informer)
777779
format: date-time
778780
type: string
779-
controllerfirstdispatchtimestamp:
780-
description: Microsecond level timestamp when controller first dispatches appwrapper
781-
format: date-time
782-
type: string
783781
failed:
784782
description: The number of resources which reached phase Failed.
785783
format: int32
@@ -792,7 +790,8 @@ spec:
792790
description: Is Dispatched?
793791
type: boolean
794792
local:
795-
description: Indicate if message is a duplicate (for Informer to recognize duplicate messages)
793+
description: Indicate if message is a duplicate (for Informer to recognize
794+
duplicate messages)
796795
type: boolean
797796
message:
798797
type: string
@@ -813,13 +812,15 @@ spec:
813812
format: int32
814813
type: integer
815814
queuejobstate:
816-
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining ...
815+
description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining,
816+
...
817817
type: string
818818
running:
819819
format: int32
820820
type: integer
821821
sender:
822-
description: Indicate sender of this message (extremely useful for debugging)
822+
description: Indicate sender of this message (extremely useful for
823+
debugging)
823824
type: string
824825
state:
825826
description: State - Pending, Running, Failed, Deleted
@@ -833,22 +834,9 @@ spec:
833834
(is this different from the MinAvailable from JobStatus)
834835
format: int32
835836
type: integer
836-
numberOfRequeueings:
837-
description: Field to keep track of how many times a requeuing event has been triggered
838-
format: int32
839-
type: integer
840-
default: 0
841-
requeueingTimeInSeconds:
842-
description: Field to keep track of total number of seconds spent in requeueing
843-
format: int32
844-
type: integer
845-
default: 0
846837
type: object
847838
required:
848839
- spec
849840
type: object
850841
served: true
851842
storage: true
852-
subresources:
853-
status: {}
854-

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ go 1.18
55
require (
66
github.com/eapache/go-resiliency v1.3.0
77
github.com/emicklei/go-restful v2.16.0+incompatible
8-
github.com/gogo/protobuf v1.3.1
98
github.com/golang/protobuf v1.4.3
109
github.com/hashicorp/go-multierror v1.1.1
1110
github.com/kubernetes-sigs/custom-metrics-apiserver v0.0.0-20210311094424-0ca2b1909cdc
@@ -46,6 +45,7 @@ require (
4645
github.com/go-openapi/jsonreference v0.19.5 // indirect
4746
github.com/go-openapi/spec v0.20.0 // indirect
4847
github.com/go-openapi/swag v0.19.12 // indirect
48+
github.com/gogo/protobuf v1.3.1 // indirect
4949
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
5050
github.com/google/go-cmp v0.5.5 // indirect
5151
github.com/google/gofuzz v1.1.0 // indirect

hack/run-e2e-kind.sh

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ export CLUSTER_CONTEXT="--name test"
3434
export IMAGE_ECHOSERVER="kicbase/echo-server:1.0"
3535
export IMAGE_UBUNTU_LATEST="ubuntu:latest"
3636
export IMAGE_UBI_LATEST="registry.access.redhat.com/ubi8/ubi:latest"
37-
export IMAGE_BUSY_BOX_LATEST="k8s.gcr.io/busybox:latest"
3837
export KIND_OPT=${KIND_OPT:=" --config ${ROOT_DIR}/hack/e2e-kind-config.yaml"}
3938
export KA_BIN=_output/bin
4039
export WAIT_TIME="20s"
@@ -208,20 +207,27 @@ function kind-up-cluster {
208207
exit 1
209208
fi
210209

211-
docker pull ${IMAGE_UBI_LATEST}
210+
docker pull ${IMAGE_ECHOSERVER}
212211
if [ $? -ne 0 ]
213212
then
214-
echo "Failed to pull ${IMAGE_UBI_LATEST}"
213+
echo "Failed to pull ${IMAGE_ECHOSERVER}"
215214
exit 1
216215
fi
217-
218-
docker pull ${IMAGE_BUSY_BOX_LATEST}
216+
217+
docker pull ${IMAGE_UBUNTU_LATEST}
219218
if [ $? -ne 0 ]
220219
then
221-
echo "Failed to pull ${IMAGE_BUSY_BOX_LATEST}"
220+
echo "Failed to pull ${IMAGE_UBUNTU_LATEST}"
222221
exit 1
223222
fi
224-
223+
224+
docker pull ${IMAGE_UBI_LATEST}
225+
if [ $? -ne 0 ]
226+
then
227+
echo "Failed to pull ${IMAGE_UBI_LATEST}"
228+
exit 1
229+
fi
230+
225231
if [[ "$MCAD_IMAGE_PULL_POLICY" = "Always" ]]
226232
then
227233
docker pull ${IMAGE_MCAD}
@@ -238,7 +244,7 @@ function kind-up-cluster {
238244
fi
239245
docker images
240246

241-
for image in ${IMAGE_ECHOSERVER} ${IMAGE_UBUNTU_LATEST} ${IMAGE_MCAD} ${IMAGE_UBI_LATEST} ${IMAGE_BUSY_BOX_LATEST}
247+
for image in ${IMAGE_ECHOSERVER} ${IMAGE_UBUNTU_LATEST} ${IMAGE_MCAD} ${IMAGE_UBI_LATEST}
242248
do
243249
kind load docker-image ${image} ${CLUSTER_CONTEXT}
244250
if [ $? -ne 0 ]
@@ -324,6 +330,8 @@ function mcad-quota-management-down {
324330
echo "Failed to undeploy controller"
325331
exit 1
326332
fi
333+
echo "Waiting for the test namespace to be cleaned up.."
334+
sleep 60
327335
}
328336

329337
function mcad-up {
@@ -394,4 +402,4 @@ setup-mcad-env
394402
kuttl-tests
395403
mcad-quota-management-down
396404
mcad-up
397-
go test ./test/e2e -v -timeout 130m -count=1
405+
go test ./test/e2e -v -timeout 120m -count=1

pkg/apis/controller/v1beta1/appwrapper.go

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ type AppWrapperService struct {
101101
}
102102

103103
// AppWrapperResource is App Wrapper aggregation resource
104-
// todo: To be depricated
104+
//todo: To be depricated
105105
type AppWrapperResource struct {
106106
metav1.TypeMeta `json:",inline"`
107107
metav1.ObjectMeta `json:"metadata"`
@@ -246,7 +246,7 @@ type AppWrapperStatus struct {
246246
// Microsecond level timestamp when controller first sees QueueJob (by Informer)
247247
ControllerFirstTimestamp metav1.MicroTime `json:"controllerfirsttimestamp,omitempty"`
248248

249-
// Microsecond level timestamp when controller first dispatches appwrapper
249+
// Microsecond level timestamp when controller first sets appwrapper in state Running
250250
ControllerFirstDispatchTimestamp metav1.MicroTime `json:"controllerfirstdispatchtimestamp,omitempty"`
251251

252252
// Tell Informer to ignore this update message (do not generate a controller event)
@@ -264,25 +264,18 @@ type AppWrapperStatus struct {
264264
// Represents the latest available observations of pods under appwrapper
265265
PendingPodConditions []PendingPodSpec `json:"pendingpodconditions"`
266266

267-
// Represents the number of cpu consumed by all pods belonging to an appwrapper.
267+
//Resources consumed
268+
268269
TotalCPU float64 `json:"totalcpu,omitempty"`
269270

270-
// Represents the amount of memory consumed by all pods belonging to an appwrapper.
271271
TotalMemory float64 `json:"totalmemory,omitempty"`
272272

273-
// Represents the total number of GPUs consumed by all pods belonging to an appwrapper.
274273
TotalGPU int64 `json:"totalgpu,omitempty"`
275-
276-
// Field to keep track of total number of seconds spent in requeueing
277-
RequeueingTimeInSeconds int `json:"requeueingTimeInSeconds,omitempty"`
278-
279-
// Field to keep track of how many times a requeuing event has been triggered
280-
NumberOfRequeueings int `json:"numberOfRequeueings,omitempty"`
281274
}
282275

283276
type AppWrapperState string
284277

285-
// enqueued, active, deleting, succeeded, failed
278+
//enqueued, active, deleting, succeeded, failed
286279
const (
287280
AppWrapperStateEnqueued AppWrapperState = "Pending"
288281
AppWrapperStateActive AppWrapperState = "Running"

pkg/controller/clusterstate/cache/cache.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ func (sc *ClusterStateCache) updateState() error {
334334
}
335335

336336
func (sc *ClusterStateCache) deleteJob(job *api.JobInfo) {
337-
klog.V(10).Infof("[deleteJob] Attempting to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name)
337+
klog.V(4).Infof("[deleteJob] Attempting to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name)
338338

339339
time.AfterFunc(5*time.Second, func() {
340340
sc.deletedJobs.AddIfNotPresent(job)

pkg/controller/metrics/adapter/adapter.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,40 +32,39 @@ package adapter
3232

3333
import (
3434
"flag"
35-
"net/http"
36-
"os"
37-
3835
"github.com/project-codeflare/multi-cluster-app-dispatcher/cmd/kar-controllers/app/options"
3936
openapinamer "k8s.io/apiserver/pkg/endpoints/openapi"
4037
genericapiserver "k8s.io/apiserver/pkg/server"
38+
"net/http"
39+
"os"
4140

4241
"github.com/emicklei/go-restful"
4342
"k8s.io/apimachinery/pkg/util/wait"
4443
"k8s.io/client-go/rest"
4544
"k8s.io/klog/v2"
4645

46+
adapterprov "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/metrics/adapter/provider"
4747
"github.com/kubernetes-sigs/custom-metrics-apiserver/pkg/apiserver"
4848
basecmd "github.com/kubernetes-sigs/custom-metrics-apiserver/pkg/cmd"
4949
"github.com/kubernetes-sigs/custom-metrics-apiserver/pkg/provider"
5050
generatedopenapi "github.com/kubernetes-sigs/custom-metrics-apiserver/test-adapter/generated/openapi"
51-
adapterprov "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/metrics/adapter/provider"
5251

5352
clusterstatecache "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/cache"
5453
)
5554

5655
// New returns a Cache implementation.
57-
func New(serverOptions *options.ServerOption, config *rest.Config, clusterStateCache clusterstatecache.Cache) *MetricsAdapter {
56+
func New(serverOptions *options.ServerOption, config *rest.Config, clusterStateCache clusterstatecache.Cache) *MetricsAdpater {
5857
return newMetricsAdpater(serverOptions, config, clusterStateCache)
5958
}
6059

61-
type MetricsAdapter struct {
60+
type MetricsAdpater struct {
6261
basecmd.AdapterBase
6362

6463
// Message is printed on succesful startup
6564
Message string
6665
}
6766

68-
func (a *MetricsAdapter) makeProviderOrDie(clusterStateCache clusterstatecache.Cache) (provider.MetricsProvider, *restful.WebService) {
67+
func (a *MetricsAdpater) makeProviderOrDie(clusterStateCache clusterstatecache.Cache) (provider.MetricsProvider, *restful.WebService) {
6968
klog.Infof("[makeProviderOrDie] Entered makeProviderOrDie()")
7069
client, err := a.DynamicClient()
7170
if err != nil {
@@ -80,7 +79,7 @@ func (a *MetricsAdapter) makeProviderOrDie(clusterStateCache clusterstatecache.C
8079
return adapterprov.NewFakeProvider(client, mapper, clusterStateCache)
8180
}
8281

83-
func covertServerOptionsToMetricsServerOptions(serverOptions *options.ServerOption) []string {
82+
func covertServerOptionsToMetricsServerOptions(serverOptions *options.ServerOption) []string{
8483
var portedArgs = make([]string, 0)
8584
if serverOptions == nil {
8685
return portedArgs
@@ -92,10 +91,11 @@ func covertServerOptionsToMetricsServerOptions(serverOptions *options.ServerOpti
9291
}
9392
return portedArgs
9493
}
95-
func newMetricsAdpater(serverOptions *options.ServerOption, config *rest.Config, clusterStateCache clusterstatecache.Cache) *MetricsAdapter {
94+
func newMetricsAdpater(serverOptions *options.ServerOption, config *rest.Config, clusterStateCache clusterstatecache.Cache) *MetricsAdpater {
9695
klog.V(10).Infof("[newMetricsAdpater] Entered newMetricsAdpater()")
9796

98-
cmd := &MetricsAdapter{}
97+
cmd := &MetricsAdpater{
98+
}
9999

100100
cmd.OpenAPIConfig = genericapiserver.DefaultOpenAPIConfig(generatedopenapi.GetOpenAPIDefinitions, openapinamer.NewDefinitionNamer(apiserver.Scheme))
101101
cmd.OpenAPIConfig.Info.Title = "MetricsAdpater"
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package queuejob
2+
3+
import (
4+
"strings"
5+
"sync"
6+
7+
arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"
8+
)
9+
10+
// ActiveAppWrapper is current scheduling AppWrapper in the XController struct.
11+
// Its sole purpose is provide a thread safe way to for use the XController logic
12+
type ActiveAppWrapper struct {
13+
activeAW *arbv1.AppWrapper
14+
activeAWMutex *sync.RWMutex
15+
}
16+
17+
// NewActiveAppWrapper
18+
func NewActiveAppWrapper() *ActiveAppWrapper {
19+
return &ActiveAppWrapper{
20+
activeAW: nil,
21+
activeAWMutex: &sync.RWMutex{},
22+
}
23+
}
24+
25+
// AtomicSet as is name implies, atomically sets the activeAW to the new value
26+
func (aw *ActiveAppWrapper) AtomicSet(newValue *arbv1.AppWrapper) {
27+
aw.activeAWMutex.Lock()
28+
defer aw.activeAWMutex.Unlock()
29+
aw.activeAW = newValue
30+
}
31+
32+
// IsActiveAppWrapper safely performs the comparison that was done inside the if block
33+
// at line 1977 in the queuejob_controller_ex.go
34+
// The code looked like this:
35+
//
36+
// if !qj.Status.CanRun && qj.Status.State == arbv1.AppWrapperStateEnqueued &&
37+
// !cc.qjqueue.IfExistUnschedulableQ(qj) && !cc.qjqueue.IfExistActiveQ(qj) {
38+
// // One more check to ensure AW is not the current active schedule object
39+
// if cc.schedulingAW == nil ||
40+
// (strings.Compare(cc.schedulingAW.Namespace, qj.Namespace) != 0 &&
41+
// strings.Compare(cc.schedulingAW.Name, qj.Name) != 0) {
42+
// cc.qjqueue.AddIfNotPresent(qj)
43+
// klog.V(3).Infof("[manageQueueJob] Recovered AppWrapper %s%s - added to active queue, Status=%+v",
44+
// qj.Namespace, qj.Name, qj.Status)
45+
// return nil
46+
// }
47+
// }
48+
func (aw *ActiveAppWrapper) IsActiveAppWrapper(name, namespace string) bool {
49+
aw.activeAWMutex.RLock()
50+
defer aw.activeAWMutex.RUnlock()
51+
return aw.activeAW == nil ||
52+
(strings.Compare(aw.activeAW.Namespace, namespace) != 0 &&
53+
strings.Compare(aw.activeAW.Name, name) != 0)
54+
}

0 commit comments

Comments
 (0)