Commit a44bc16

Added e2e test to reproduce issue #340 (#397)
* Added e2e tests to emulate the condition, with minor fixes in the e2e script.
* Incorporated feedback and tests from #398.
* Added unit tests to emulate a long-running consumer.
1 parent ab6d742 commit a44bc16

File tree

17 files changed: +511 −8 lines changed


go.mod

Lines changed: 1 addition & 0 deletions
```diff
@@ -3,6 +3,7 @@ module github.com/project-codeflare/multi-cluster-app-dispatcher
 go 1.18
 
 require (
+    github.com/eapache/go-resiliency v1.3.0
     github.com/emicklei/go-restful v2.16.0+incompatible
     github.com/golang/protobuf v1.4.3
     github.com/hashicorp/go-multierror v1.1.1
```

go.sum

Lines changed: 2 additions & 0 deletions
```diff
@@ -96,6 +96,8 @@ github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3
 github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
 github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
 github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
+github.com/eapache/go-resiliency v1.3.0 h1:RRL0nge+cWGlxXbUzJ7yMcq6w2XBEr19dCN6HECGaT0=
+github.com/eapache/go-resiliency v1.3.0/go.mod h1:5yPzW0MIvSe0JDsv0v+DvcjEv2FyD6iZYSs1ZI+iQho=
 github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc=
 github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs=
 github.com/emicklei/go-restful v2.9.5+incompatible/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs=
```
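The only new dependency is `github.com/eapache/go-resiliency`; its `retrier` package is what the new unit test uses to retry quota allocation with exponential backoff. A minimal, self-contained sketch of that API, using the same backoff parameters as the test (the failing work function here is illustrative, not part of this commit):

```go
package main

import (
    "errors"
    "fmt"
    "time"

    "github.com/eapache/go-resiliency/retrier"
)

func main() {
    // Retry up to 10 times, starting at 10ms between attempts and capped at 1s.
    // A nil classifier means every non-nil error is retried; the new test instead
    // supplies an AllocationClassifier that retries only the quota-exhausted message.
    r := retrier.New(retrier.LimitedExponentialBackoff(10, 10*time.Millisecond, 1*time.Second), nil)

    attempts := 0
    err := r.Run(func() error {
        attempts++
        if attempts < 3 {
            return errors.New("quota not yet available") // transient failure, retried
        }
        return nil // a nil error stops the retrier
    })
    fmt.Println(attempts, err) // prints: 3 <nil>
}
```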

hack/run-e2e-kind.sh

Lines changed: 9 additions & 1 deletion
```diff
@@ -33,6 +33,7 @@ export CLUSTER_CONTEXT="--name test"
 # Using older image due to older version of kubernetes cluster"
 export IMAGE_ECHOSERVER="kicbase/echo-server:1.0"
 export IMAGE_UBUNTU_LATEST="ubuntu:latest"
+export IMAGE_UBI_LATEST="registry.access.redhat.com/ubi8/ubi:latest"
 export KIND_OPT=${KIND_OPT:=" --config ${ROOT_DIR}/hack/e2e-kind-config.yaml"}
 export KA_BIN=_output/bin
 export WAIT_TIME="20s"
@@ -220,6 +221,13 @@ function kind-up-cluster {
     exit 1
   fi
 
+  docker pull ${IMAGE_UBI_LATEST}
+  if [ $? -ne 0 ]
+  then
+    echo "Failed to pull ${IMAGE_UBI_LATEST}"
+    exit 1
+  fi
+
   if [[ "$MCAD_IMAGE_PULL_POLICY" = "Always" ]]
   then
     docker pull ${IMAGE_MCAD}
@@ -236,7 +244,7 @@ function kind-up-cluster {
   fi
   docker images
 
-  for image in ${IMAGE_ECHOSERVER} ${IMAGE_UBUNTU_LATEST} ${IMAGE_MCAD}
+  for image in ${IMAGE_ECHOSERVER} ${IMAGE_UBUNTU_LATEST} ${IMAGE_MCAD} ${IMAGE_UBI_LATEST}
   do
     kind load docker-image ${image} ${CLUSTER_CONTEXT}
     if [ $? -ne 0 ]
```
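The kind-up-cluster changes pull the UBI 8 image up front and preload it into the kind cluster alongside the echo-server, Ubuntu, and MCAD images, so pods in the new e2e scenario that reference it do not need to pull it at test run time.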

pkg/controller/quota/quotaforestmanager/qm_lib_backend_with_quotasubt_mgr.go

Lines changed: 17 additions & 7 deletions
```diff
@@ -530,20 +530,19 @@ func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi
     klog.V(4).Infof("[Fits] Sending quota allocation request: %#v ", consumerInfo)
     allocResponse, err := qm.quotaManagerBackend.AllocateForest(QuotaManagerForestName, consumerID)
     if err != nil {
+        qm.removeConsumer(consumerID)
         klog.Errorf("[Fits] Error allocating consumer: %s/%s, err=%#v.", aw.Namespace, aw.Name, err)
         return false, nil, err.Error()
     }
-    if allocResponse != nil && len(strings.TrimSpace(allocResponse.GetMessage())) > 0 {
-        klog.Errorf("[Fits] Error allocating consumer: %s/%s, msg=%s, err=%#v.",
-            aw.Namespace, aw.Name, allocResponse.GetMessage(), err)
-        return false, nil, allocResponse.GetMessage()
-    }
     klog.V(4).Infof("[Fits] allocation response received. Quota Allocated: %t, Message: '%s', Preempted app wrappers count: %d", allocResponse.IsAllocated(),
         strings.TrimSpace(allocResponse.GetMessage()), len(allocResponse.GetPreemptedIds()))
     doesFit := allocResponse.IsAllocated()
+    if !doesFit {
+        qm.removeConsumer(consumerID)
+        return doesFit, preemptIds, strings.TrimSpace(allocResponse.GetMessage())
+    }
     preemptIds = qm.getAppWrappers(allocResponse.GetPreemptedIds())
-
-    return doesFit, preemptIds, ""
+    return doesFit, preemptIds, strings.TrimSpace(allocResponse.GetMessage())
 }
 
 func (qm *QuotaManager) getAppWrappers(preemptIds []string) []*arbv1.AppWrapper {
@@ -614,3 +613,14 @@ func (qm *QuotaManager) Release(aw *arbv1.AppWrapper) bool {
 
     return released
 }
+func (qm *QuotaManager) removeConsumer(consumerID string) {
+    // removing the consumer to allow for the consumer to be added if and when
+    // the function is called for the same app wrapper
+    removed, err := qm.quotaManagerBackend.RemoveConsumer(consumerID)
+    if err != nil {
+        klog.Warningf("Failed to remove consumer %s, %#v", consumerID, err)
+    }
+    if !removed {
+        klog.Warningf("Failed to remove consumer %s", consumerID)
+    }
+}
```
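The behavioural change in Fits: a non-empty allocation message is no longer treated as a failure by itself; the AppWrapper is rejected only when AllocateForest returns an error or an unallocated response, and both of those paths now call the new removeConsumer helper to deregister the consumer from the quota backend. That cleanup allows a later Fits call for the same AppWrapper to add the consumer again rather than failing on the stale entry left by the earlier attempt, which appears to be the condition behind issue #340 that the new tests reproduce.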

pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager_test.go

Lines changed: 168 additions & 0 deletions
```diff
@@ -19,7 +19,9 @@ package quota_test
 import (
     "strings"
     "testing"
+    "time"
 
+    "github.com/eapache/go-resiliency/retrier"
     "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/quota-forest/quota-manager/quota"
     "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/quota-forest/quota-manager/quota/utils"
     "github.com/stretchr/testify/assert"
@@ -225,3 +227,169 @@ func TestNewQuotaManagerConsumerAllocationRelease(t *testing.T) {
         })
     }
 }
+func TestQuotaManagerQuotaUsedLongRunningConsumers(t *testing.T) {
+    forestName := "unit-test-1"
+    qmManagerUnderTest := quota.NewManager()
+
+    err := qmManagerUnderTest.AddForest(forestName)
+    assert.NoError(t, err, "No error expected when adding a forest")
+    testTreeName, err := qmManagerUnderTest.AddTreeFromString(
+        `{
+            "kind": "QuotaTree",
+            "metadata": {
+                "name": "test-tree"
+            },
+            "spec": {
+                "resourceNames": [
+                    "cpu",
+                    "memory"
+                ],
+                "nodes": {
+                    "root": {
+                        "parent": "nil",
+                        "hard": "true",
+                        "quota": {
+                            "cpu": "10",
+                            "memory": "256"
+                        }
+                    },
+                    "gold": {
+                        "parent": "root",
+                        "hard": "true",
+                        "quota": {
+                            "cpu": "10",
+                            "memory": "256"
+                        }
+                    }
+                }
+            }
+        }`)
+    if err != nil {
+        assert.Fail(t, "No error expected when adding a tree to forest")
+    }
+    err = qmManagerUnderTest.AddTreeToForest(forestName, testTreeName)
+    assert.NoError(t, err, "No error expected when adding a tree from forest")
+    modeSet := qmManagerUnderTest.SetMode(quota.Normal)
+    assert.True(t, modeSet, "Setting the mode should not fail.")
+
+    // Define the test table
+    var tests = []struct {
+        name     string
+        consumer utils.JConsumer
+    }{
+        // the table itself
+        {"Gold consumer 1",
+            utils.JConsumer{
+                Kind: "Consumer",
+                MetaData: utils.JMetaData{
+                    Name: "gpld-consumer-data",
+                },
+                Spec: utils.JConsumerSpec{
+                    ID: "gold-consumer-1",
+                    Trees: []utils.JConsumerTreeSpec{
+                        {
+                            TreeName: testTreeName,
+                            GroupID:  "gold",
+                            Request: map[string]int{
+                                "cpu":    10,
+                                "memory": 4,
+                                "gpu":    0,
+                            },
+                            Priority:      0,
+                            CType:         0,
+                            UnPreemptable: false,
+                        },
+                    },
+                },
+            },
+        },
+        // the table itself
+        {"Gold consumer 2",
+            utils.JConsumer{
+                Kind: "Consumer",
+                MetaData: utils.JMetaData{
+                    Name: "gpld-consumer-data",
+                },
+                Spec: utils.JConsumerSpec{
+                    ID: "gold-consumer-2",
+                    Trees: []utils.JConsumerTreeSpec{
+                        {
+                            TreeName: testTreeName,
+                            GroupID:  "gold",
+                            Request: map[string]int{
+                                "cpu":    10,
+                                "memory": 4,
+                                "gpu":    0,
+                            },
+                            Priority:      0,
+                            CType:         0,
+                            UnPreemptable: false,
+                        },
+                    },
+                },
+            },
+        },
+    }
+    // Execute tests in parallel
+    for _, tc := range tests {
+        tc := tc // capture range variable
+        t.Run(tc.name, func(t *testing.T) {
+            t.Parallel()
+            // Get list of quota management tree IDs
+            qmTreeIDs := qmManagerUnderTest.GetTreeNames()
+
+            consumerInfo, err := quota.NewConsumerInfo(tc.consumer)
+            assert.NoError(t, err, "No error expected when building consumer")
+            assert.Contains(t, qmTreeIDs, tc.consumer.Spec.Trees[0].TreeName)
+
+            retryAllocation := retrier.New(retrier.LimitedExponentialBackoff(10, 10*time.Millisecond, 1*time.Second),
+                &AllocationClassifier{})
+
+            err = retryAllocation.Run(func() error {
+                added, err2 := qmManagerUnderTest.AddConsumer(consumerInfo)
+                assert.NoError(t, err2, "No error expected when adding consumer")
+                assert.True(t, added, "Consumer is expected to be added")
+
+                response, err2 := qmManagerUnderTest.AllocateForest(forestName, consumerInfo.GetID())
+                if err2 == nil {
+                    assert.Equal(t, 0, len(strings.TrimSpace(response.GetMessage())), "A empty response is expected")
+                    assert.True(t, response.IsAllocated(), "The allocation should succeed")
+                } else {
+                    removed, err3 := qmManagerUnderTest.RemoveConsumer(consumerInfo.GetID())
+                    assert.NoError(t, err3, "No Error expected when removing consumer")
+                    assert.True(t, removed, "Removal of consumer should succeed")
+                }
+                return err2
+
+            })
+            if err != nil {
+                assert.Failf(t, "Allocation of quota should have succeed: '%s'", err.Error())
+            }
+
+            //simulate a long running consumer that has quota allocated
+            time.Sleep(10 * time.Millisecond)
+
+            deAllocated := qmManagerUnderTest.DeAllocateForest(forestName, consumerInfo.GetID())
+            assert.True(t, deAllocated, "De-allocation expected to succeed")
+
+            removed, err := qmManagerUnderTest.RemoveConsumer(consumerInfo.GetID())
+            assert.NoError(t, err, "No Error expected when removing consumer")
+            assert.True(t, removed, "Removal of consumer should succeed")
+
+        })
+    }
+
+}
+
+type AllocationClassifier struct {
+}
+
+func (c *AllocationClassifier) Classify(err error) retrier.Action {
+    if err == nil {
+        return retrier.Succeed
+    }
+    if strings.TrimSpace(err.Error()) == "Failed to allocate quota on quota designation 'test-tree'" {
+        return retrier.Retry
+    }
+    return retrier.Fail
+}
```
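Both table entries request the tree's full CPU quota (cpu: 10 against a quota of 10 on both root and gold), and the subtests run in parallel, so only one consumer can hold the allocation at a time. When the second consumer's AllocateForest call fails with "Failed to allocate quota on quota designation 'test-tree'", the AllocationClassifier maps that error to retrier.Retry; the failing subtest removes its consumer, backs off, and re-adds it on the next attempt until the first consumer, which sleeps to emulate a long-running workload, deallocates and frees the quota.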
Lines changed: 25 additions & 0 deletions
```diff
@@ -0,0 +1,25 @@
+# Verify CRDs existence
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: appwrappers.mcad.ibm.com
+status:
+  acceptedNames:
+    kind: AppWrapper
+    listKind: AppWrapperList
+    plural: appwrappers
+    singular: appwrapper
+  storedVersions:
+    - v1beta1
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: quotasubtrees.ibm.com
+status:
+  acceptedNames:
+    kind: QuotaSubtree
+    singular: quotasubtree
+    plural: quotasubtrees
+  storedVersions:
+    - v1
```
Lines changed: 24 additions & 0 deletions
```diff
@@ -0,0 +1,24 @@
+# Verify subtree creations
+apiVersion: ibm.com/v1
+kind: QuotaSubtree
+metadata:
+  name: context-root
+  namespace: kube-system
+  labels:
+    tree: quota_context
+---
+apiVersion: ibm.com/v1
+kind: QuotaSubtree
+metadata:
+  name: service-root
+  namespace: kube-system
+  labels:
+    tree: quota_service
+---
+apiVersion: ibm.com/v1
+kind: QuotaSubtree
+metadata:
+  name: context-root-children
+  namespace: kube-system
+  labels:
+    tree: quota_context
```
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Verify test namespace existence
2+
apiVersion: v1
3+
kind: Namespace
4+
metadata:
5+
name: quota-errors
Lines changed: 4 additions & 0 deletions
```diff
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: quota-errors
```
Lines changed: 27 additions & 0 deletions
```diff
@@ -0,0 +1,27 @@
+# Verify AppWrapper was dispatched and pod was created
+apiVersion: mcad.ibm.com/v1beta1
+kind: AppWrapper
+metadata:
+  name: deployment-silver-lo-pri-1replica
+  namespace: quota-errors
+  labels:
+    quota_context: "silver"
+    quota_service: "service-root"
+status:
+  state: Running
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: deployment-silver-lo-pri-1replica
+  namespace: quota-errors
+  labels:
+    app: deployment-silver-lo-pri-1replica
+    appwrapper.mcad.ibm.com: deployment-silver-lo-pri-1replica
+    resourceName: deployment-silver-lo-pri-1replica
+status:
+  availableReplicas: 1
+  observedGeneration: 1
+  readyReplicas: 1
+  replicas: 1
+  updatedReplicas: 1
```
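The remaining added files are declarative checks for the e2e scenario: they assert that the appwrappers and quotasubtrees CRDs are registered, that the quota_context and quota_service subtree roots exist in kube-system, that the quota-errors namespace is present, and that the deployment-silver-lo-pri-1replica AppWrapper reaches the Running state with its Deployment reporting one ready replica.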
