From ac1297dff12b44d2212fb14cf3f581565ca6a6f8 Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Mon, 15 May 2023 20:16:22 -0400 Subject: [PATCH 01/10] force release quota when AW does not exists --- .../quota-manager/quota/quotamanager.go | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go b/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go index a5f5ee8b4..21a1c4cc8 100644 --- a/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go +++ b/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go @@ -19,11 +19,15 @@ package quota import ( "bytes" "fmt" + "regexp" "strings" "sync" + clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/quota-forest/quota-manager/quota/core" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/quota-forest/quota-manager/quota/utils" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" "k8s.io/klog/v2" ) @@ -434,7 +438,24 @@ func (m *Manager) AllocateForest(forestName string, consumerID string) (response return nil, fmt.Errorf("consumer %s does not exist, create and add first", consumerID) } if forestController.IsConsumerAllocated(consumerID) { - return nil, fmt.Errorf("consumer %s already allocated on forest %s", consumerID, forestName) + //allocation can pass if consumers are not released + //make sure that appwrapper (consumer) does not exist in target namespace + re := regexp.MustCompile("_") + split := re.Split(consumerID, -1) + namespace := split[1] + awName := split[3] + config, err := rest.InClusterConfig() + if err != nil { + klog.Errorf("unable to retrieve in-cluster kubeconfig %v", err) + } + arbclients := clientset.NewForConfigOrDie(config) + _, err = arbclients.ArbV1().AppWrappers(namespace).Get(awName, v1.GetOptions{}) + if err != nil { + return nil, fmt.Errorf("consumer %s already allocated on forest %s", consumerID, forestName) + } else { + //release quota + m.DeAllocateForest(forestName, consumerID) + } } resourceNames := forestController.GetResourceNames() forestConsumer, err := consumerInfo.CreateForestConsumer(forestName, resourceNames) From 9f2950c02a2baafa595440a9f533448ead6f1099 Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Tue, 16 May 2023 15:28:57 -0400 Subject: [PATCH 02/10] resolve review --- .../quota-manager/quota/quotamanager.go | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go b/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go index 21a1c4cc8..a1652703b 100644 --- a/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go +++ b/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go @@ -430,6 +430,7 @@ func (m *Manager) AllocateForest(forestName string, consumerID string) (response defer m.mutex.Unlock() forestController := m.forests[forestName] + var awExistsInNamespace bool = false if forestController == nil { return nil, fmt.Errorf("invalid forest name %s", forestName) } @@ -437,26 +438,32 @@ func (m *Manager) AllocateForest(forestName string, consumerID string) (response if consumerInfo == nil { return nil, fmt.Errorf("consumer %s does not exist, create and add first", consumerID) } - if forestController.IsConsumerAllocated(consumerID) { - //allocation can pass if consumers are not released - //make sure that appwrapper (consumer) does not exist in target namespace - re := regexp.MustCompile("_") - split := re.Split(consumerID, -1) - namespace := split[1] - awName := split[3] - config, err := rest.InClusterConfig() - if err != nil { - klog.Errorf("unable to retrieve in-cluster kubeconfig %v", err) - } - arbclients := clientset.NewForConfigOrDie(config) - _, err = arbclients.ArbV1().AppWrappers(namespace).Get(awName, v1.GetOptions{}) - if err != nil { + //allocation can pass if consumers are not released + //make sure that appwrapper (consumer) does not exist in target namespace + re := regexp.MustCompile("_") + split := re.Split(consumerID, -1) + namespace := split[1] + awName := split[3] + config, err := rest.InClusterConfig() + if err != nil { + klog.Errorf("unable to retrieve in-cluster kubeconfig %v", err) + } + arbclients := clientset.NewForConfigOrDie(config) + awQueried, err := arbclients.ArbV1().AppWrappers(namespace).Get(awName, v1.GetOptions{}) + if awQueried.Name == awName { + awExistsInNamespace = true + klog.V(5).Infof("Found consumer %v in namespace", consumerID) + } + if awExistsInNamespace { + if forestController.IsConsumerAllocated(consumerID) { return nil, fmt.Errorf("consumer %s already allocated on forest %s", consumerID, forestName) - } else { - //release quota + } + } else { + if forestController.IsConsumerAllocated(consumerID) { m.DeAllocateForest(forestName, consumerID) } } + resourceNames := forestController.GetResourceNames() forestConsumer, err := consumerInfo.CreateForestConsumer(forestName, resourceNames) if err != nil { From 3d9b39a797702337b996b5c2a04834b6aca1f40a Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Tue, 16 May 2023 17:13:11 -0400 Subject: [PATCH 03/10] address review-2 --- .../quota-manager/quota/quotamanager.go | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go b/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go index a1652703b..48edad3e9 100644 --- a/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go +++ b/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go @@ -19,13 +19,13 @@ package quota import ( "bytes" "fmt" - "regexp" "strings" "sync" clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/quota-forest/quota-manager/quota/core" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/quota-forest/quota-manager/quota/utils" + "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/util" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/rest" "k8s.io/klog/v2" @@ -430,7 +430,6 @@ func (m *Manager) AllocateForest(forestName string, consumerID string) (response defer m.mutex.Unlock() forestController := m.forests[forestName] - var awExistsInNamespace bool = false if forestController == nil { return nil, fmt.Errorf("invalid forest name %s", forestName) } @@ -438,21 +437,24 @@ func (m *Manager) AllocateForest(forestName string, consumerID string) (response if consumerInfo == nil { return nil, fmt.Errorf("consumer %s does not exist, create and add first", consumerID) } - //allocation can pass if consumers are not released + //allocation can pass if consumers inside the forest is not released //make sure that appwrapper (consumer) does not exist in target namespace - re := regexp.MustCompile("_") - split := re.Split(consumerID, -1) - namespace := split[1] - awName := split[3] + namespace, awName := util.ParseId(consumerID) config, err := rest.InClusterConfig() if err != nil { klog.Errorf("unable to retrieve in-cluster kubeconfig %v", err) } arbclients := clientset.NewForConfigOrDie(config) awQueried, err := arbclients.ArbV1().AppWrappers(namespace).Get(awName, v1.GetOptions{}) + var awExistsInNamespace bool = false + if err != nil { + klog.Errorf("Unable to query consumer %v,", consumerID) + //force set to true + awExistsInNamespace = true + } if awQueried.Name == awName { awExistsInNamespace = true - klog.V(5).Infof("Found consumer %v in namespace", consumerID) + klog.V(5).Infof("[AllocateForest] Found consumer %v in namespace", consumerID) } if awExistsInNamespace { if forestController.IsConsumerAllocated(consumerID) { From cd2eb7efa9120c82c6192efc29c631b25c5c8e54 Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Wed, 17 May 2023 11:08:03 -0400 Subject: [PATCH 04/10] address review 3 --- .../quota-forest/quota-manager/quota/quotamanager.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go b/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go index 48edad3e9..fc0c2699b 100644 --- a/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go +++ b/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go @@ -437,7 +437,7 @@ func (m *Manager) AllocateForest(forestName string, consumerID string) (response if consumerInfo == nil { return nil, fmt.Errorf("consumer %s does not exist, create and add first", consumerID) } - //allocation can pass if consumers inside the forest is not released + //allocation can pass if consumers inside the forest are not released //make sure that appwrapper (consumer) does not exist in target namespace namespace, awName := util.ParseId(consumerID) config, err := rest.InClusterConfig() @@ -462,6 +462,7 @@ func (m *Manager) AllocateForest(forestName string, consumerID string) (response } } else { if forestController.IsConsumerAllocated(consumerID) { + klog.V(4).Infof("[AllocateForest] Found consumer %v in quota tree but didn't find its matching AppWrapper. Deleting consumer from the tree", consumerID) m.DeAllocateForest(forestName, consumerID) } } From 60f8f0025f3dd9d5bba09a57b2ce71cc192db41d Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Fri, 19 May 2023 18:59:20 -0400 Subject: [PATCH 05/10] move release code up in call stack --- .../queuejob/queuejob_controller_ex.go | 6 +++- .../quota-manager/quota/quotamanager.go | 35 ++----------------- 2 files changed, 8 insertions(+), 33 deletions(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 009192998..a064141cc 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -1029,6 +1029,8 @@ func (qjm *XController) chooseAgent(qj *arbv1.AppWrapper) string { //Now evaluate quota if qjm.serverOption.QuotaEnabled { if qjm.quotaManager != nil { + //quota trees faile to update with AW deletes, release quota before assigning + qjm.quotaManager.Release(qj) if fits, preemptAWs, _ := qjm.quotaManager.Fits(qj, qjAggrResources, proposedPreemptions); fits { klog.V(2).Infof("[chooseAgent] AppWrapper %s has enough quota.\n", qj.Name) qjm.preemptAWJobs(preemptAWs) @@ -1267,7 +1269,9 @@ func (qjm *XController) ScheduleNext() { if qjm.quotaManager != nil { var msg string var preemptAWs []*arbv1.AppWrapper - quotaFits, preemptAWs, msg = qjm.quotaManager.Fits(qj, aggqj, proposedPreemptions) + //quota trees fail to update with AW deletes, release quota before assigning + qjm.quotaManager.Release(qj) + quotaFits, preemptAWs, msg := qjm.quotaManager.Fits(qj, aggqj, proposedPreemptions) if quotaFits { klog.Infof("[ScheduleNext] HOL quota evaluation successful %s for %s activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v due to quota limits", qj.Name, time.Now().Sub(HOLStartTime), qjm.qjqueue.IfExistActiveQ(qj), qjm.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status) // Set any jobs that are marked for preemption diff --git a/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go b/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go index fc0c2699b..36b1dceb2 100644 --- a/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go +++ b/pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go @@ -22,12 +22,8 @@ import ( "strings" "sync" - clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/quota-forest/quota-manager/quota/core" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/quota-forest/quota-manager/quota/utils" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/util" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/rest" "k8s.io/klog/v2" ) @@ -437,34 +433,9 @@ func (m *Manager) AllocateForest(forestName string, consumerID string) (response if consumerInfo == nil { return nil, fmt.Errorf("consumer %s does not exist, create and add first", consumerID) } - //allocation can pass if consumers inside the forest are not released - //make sure that appwrapper (consumer) does not exist in target namespace - namespace, awName := util.ParseId(consumerID) - config, err := rest.InClusterConfig() - if err != nil { - klog.Errorf("unable to retrieve in-cluster kubeconfig %v", err) - } - arbclients := clientset.NewForConfigOrDie(config) - awQueried, err := arbclients.ArbV1().AppWrappers(namespace).Get(awName, v1.GetOptions{}) - var awExistsInNamespace bool = false - if err != nil { - klog.Errorf("Unable to query consumer %v,", consumerID) - //force set to true - awExistsInNamespace = true - } - if awQueried.Name == awName { - awExistsInNamespace = true - klog.V(5).Infof("[AllocateForest] Found consumer %v in namespace", consumerID) - } - if awExistsInNamespace { - if forestController.IsConsumerAllocated(consumerID) { - return nil, fmt.Errorf("consumer %s already allocated on forest %s", consumerID, forestName) - } - } else { - if forestController.IsConsumerAllocated(consumerID) { - klog.V(4).Infof("[AllocateForest] Found consumer %v in quota tree but didn't find its matching AppWrapper. Deleting consumer from the tree", consumerID) - m.DeAllocateForest(forestName, consumerID) - } + + if forestController.IsConsumerAllocated(consumerID) { + return nil, fmt.Errorf("consumer %s already allocated on forest %s", consumerID, forestName) } resourceNames := forestController.GetResourceNames() From ebcba6e4644726c6bd1dde36ebe5cbefcca68c09 Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Thu, 25 May 2023 07:04:21 -0400 Subject: [PATCH 06/10] qm test config change --- test/e2e-kuttl/quota-forest/04-install.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/e2e-kuttl/quota-forest/04-install.yaml b/test/e2e-kuttl/quota-forest/04-install.yaml index 93595eb19..6c23f438c 100644 --- a/test/e2e-kuttl/quota-forest/04-install.yaml +++ b/test/e2e-kuttl/quota-forest/04-install.yaml @@ -13,6 +13,17 @@ spec: metadata: {} Items: [] GenericItems: + - replicas: 1 + custompodresources: + - replicas: 1 + requests: + cpu: 600m + memory: 95Mi + nvidia.com/gpu: 0 + limits: + cpu: 600m + memory: 95Mi + nvidia.com/gpu: 0 - metadata: {} replicas: 1 generictemplate: From a27ae908dbdbbe274a77e43871da920447fb933a Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Thu, 25 May 2023 07:28:26 -0400 Subject: [PATCH 07/10] remove custompodresources --- test/e2e-kuttl/quota-forest/04-install.yaml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/test/e2e-kuttl/quota-forest/04-install.yaml b/test/e2e-kuttl/quota-forest/04-install.yaml index 6c23f438c..93595eb19 100644 --- a/test/e2e-kuttl/quota-forest/04-install.yaml +++ b/test/e2e-kuttl/quota-forest/04-install.yaml @@ -13,17 +13,6 @@ spec: metadata: {} Items: [] GenericItems: - - replicas: 1 - custompodresources: - - replicas: 1 - requests: - cpu: 600m - memory: 95Mi - nvidia.com/gpu: 0 - limits: - cpu: 600m - memory: 95Mi - nvidia.com/gpu: 0 - metadata: {} replicas: 1 generictemplate: From 630a04129965e5aebe2ba453492df29885f90923 Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Thu, 25 May 2023 07:59:33 -0400 Subject: [PATCH 08/10] fix resorurce unit --- test/e2e-kuttl/quota-forest/06-install.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/e2e-kuttl/quota-forest/06-install.yaml b/test/e2e-kuttl/quota-forest/06-install.yaml index 985cc197b..c2a76f7fc 100644 --- a/test/e2e-kuttl/quota-forest/06-install.yaml +++ b/test/e2e-kuttl/quota-forest/06-install.yaml @@ -43,8 +43,16 @@ spec: - while true; do sleep 10; done resources: requests: +<<<<<<< HEAD cpu: 600m memory: 95Mi limits: cpu: 600m memory: 95Mi +======= + cpu: "0.6m" + memory: "95Mi" + limits: + cpu: "0.6m" + memory: "95Mi" +>>>>>>> 7d1d38b9 (fix resorurce unit) From bdacd6bbbc2ce61fff488889c70a02e6c478aac9 Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Thu, 25 May 2023 08:23:27 -0400 Subject: [PATCH 09/10] quota config changes --- test/e2e-kuttl/install-quota-subtree.yaml | 1 + test/e2e-kuttl/quota-forest/06-install.yaml | 8 -------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/test/e2e-kuttl/install-quota-subtree.yaml b/test/e2e-kuttl/install-quota-subtree.yaml index 8580ab2ca..1299cf4ba 100644 --- a/test/e2e-kuttl/install-quota-subtree.yaml +++ b/test/e2e-kuttl/install-quota-subtree.yaml @@ -40,6 +40,7 @@ spec: children: - name: gold quotas: + hardLimit: false requests: cpu: 1075m memory: 450Mi diff --git a/test/e2e-kuttl/quota-forest/06-install.yaml b/test/e2e-kuttl/quota-forest/06-install.yaml index c2a76f7fc..985cc197b 100644 --- a/test/e2e-kuttl/quota-forest/06-install.yaml +++ b/test/e2e-kuttl/quota-forest/06-install.yaml @@ -43,16 +43,8 @@ spec: - while true; do sleep 10; done resources: requests: -<<<<<<< HEAD cpu: 600m memory: 95Mi limits: cpu: 600m memory: 95Mi -======= - cpu: "0.6m" - memory: "95Mi" - limits: - cpu: "0.6m" - memory: "95Mi" ->>>>>>> 7d1d38b9 (fix resorurce unit) From 8375e7b2942b413394aee1cc30bb3be753553988 Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Thu, 25 May 2023 08:59:15 -0400 Subject: [PATCH 10/10] increase priority --- test/e2e-kuttl/quota-forest/06-install.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e-kuttl/quota-forest/06-install.yaml b/test/e2e-kuttl/quota-forest/06-install.yaml index 985cc197b..fb7ba84ac 100644 --- a/test/e2e-kuttl/quota-forest/06-install.yaml +++ b/test/e2e-kuttl/quota-forest/06-install.yaml @@ -9,7 +9,7 @@ metadata: spec: service: spec: {} - priority: 1000 + priority: 10000 resources: metadata: {} GenericItems: