Skip to content

Commit 4545804

Browse files
authored
Automate Zero Downtime Scale tests (#2259)
Problem: We want our NFR tests to be fully automated to save developer time in each release cycle and to have a repeatable way of running the tests. Solution: Automate the zero downtime scaling test. Logs are no longer collected as they were previously, because error logs would be unreliable to collect from pods that are scaling down (we don't have persistence that we can easily use in the automation to gather historic logs). Ultimately, we still ensure that traffic is flowing and status updates occur, which are the important pieces here.
1 parent 25c6924 commit 4545804

File tree

139 files changed

+512
-88555
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

139 files changed

+512
-88555
lines changed

tests/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ run-tests-on-vm: ## Run the functional tests on a GCP VM
9797

9898
.PHONY: nfr-test
9999
nfr-test: ## Run the NFR tests on a GCP VM
100-
NFR=true bash scripts/run-tests-gcp-vm.sh
100+
NFR=true CI=$(CI) bash scripts/run-tests-gcp-vm.sh
101101

102102
.PHONY: start-longevity-test
103103
start-longevity-test: export START_LONGEVITY=true
@@ -110,7 +110,7 @@ stop-longevity-test: nfr-test ## Stop the longevity test and collects results
110110
.PHONY: .vm-nfr-test
111111
.vm-nfr-test: ## Runs the NFR tests on the GCP VM (called by `nfr-test`)
112112
go run github.com/onsi/ginkgo/v2/ginkgo --randomize-all --randomize-suites --keep-going --fail-on-pending --trace -r -v \
113-
--label-filter "nfr" $(GINKGO_FLAGS) ./suite -- --gateway-api-version=$(GW_API_VERSION) \
113+
--label-filter "nfr" $(GINKGO_FLAGS) --timeout 3h ./suite -- --gateway-api-version=$(GW_API_VERSION) \
114114
--gateway-api-prev-version=$(GW_API_PREV_VERSION) --image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \
115115
--plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) --nginx-plus-image-repo=$(NGINX_PLUS_PREFIX) \
116116
--pull-policy=$(PULL_POLICY) --service-type=$(GW_SERVICE_TYPE) \

tests/README.md

Lines changed: 0 additions & 2 deletions

tests/framework/resourcemanager.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,3 +744,32 @@ func (rm *ResourceManager) WaitForPodsToBeReadyWithCount(ctx context.Context, na
744744
},
745745
)
746746
}
747+
748+
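// WaitForGatewayObservedGeneration polls the provided Gateway every 500ms until at least one of its status
// conditions reports an ObservedGeneration equal to the expected value, the Get call fails, or ctx is done.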
749+
func (rm *ResourceManager) WaitForGatewayObservedGeneration(
750+
ctx context.Context,
751+
namespace,
752+
name string,
753+
generation int,
754+
) error {
755+
return wait.PollUntilContextCancel(
756+
ctx,
757+
500*time.Millisecond,
758+
true, /* poll immediately */
759+
func(ctx context.Context) (bool, error) {
760+
var gw v1.Gateway
761+
key := types.NamespacedName{Namespace: namespace, Name: name}
762+
if err := rm.K8sClient.Get(ctx, key, &gw); err != nil {
763+
return false, err
764+
}
765+
766+
for _, cond := range gw.Status.Conditions {
767+
if cond.ObservedGeneration == int64(generation) {
768+
return true, nil
769+
}
770+
}
771+
772+
return false, nil
773+
},
774+
)
775+
}

tests/scripts/run-tests-gcp-vm.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,13 @@ if [ $retcode -ne 0 ]; then
2323
fi
2424

2525
if [ "${NFR}" = "true" ]; then
26-
gcloud compute scp --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} --recurse username@${RESOURCE_NAME}:~/nginx-gateway-fabric/tests/results .
26+
## Use rsync when running locally (faster); in the pipeline, use gcloud scp so that no SSH config has to be generated
27+
if [ "${CI}" = "false" ]; then
28+
gcloud compute config-ssh --ssh-config-file ngf-gcp.ssh > /dev/null
29+
rsync -ave 'ssh -F ngf-gcp.ssh' username@${RESOURCE_NAME}.${GKE_CLUSTER_ZONE}.${GKE_PROJECT}:~/nginx-gateway-fabric/tests/results .
30+
else
31+
gcloud compute scp --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} --recurse username@${RESOURCE_NAME}:~/nginx-gateway-fabric/tests/results .
32+
fi
2733
fi
2834

2935
## If tearing down the longevity test, we need to collect logs from gcloud and add to the results

tests/suite/manifests/ngf-upgrade/values.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,3 @@ affinity:
2525
labelSelector:
2626
matchLabels:
2727
app.kubernetes.io/name: nginx-gateway
28-
29-
service:
30-
annotations:
31-
networking.gke.io/load-balancer-type: "Internal"
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,11 @@
11
nginxGateway:
2-
image:
3-
repository: ghcr.io/nginxinc/nginx-gateway-fabric
4-
tag: edge # change this tag if you are testing a different version
5-
pullPolicy: IfNotPresent
62
lifecycle:
73
preStop:
84
exec:
95
command:
106
- /usr/bin/gateway
117
- sleep
128
- --duration=40s
13-
config:
14-
logging:
15-
level: debug
169

1710
nginx:
1811
lifecycle:
@@ -31,7 +24,3 @@ affinity:
3124
labelSelector:
3225
matchLabels:
3326
app.kubernetes.io/name: nginx-gateway
34-
35-
service:
36-
annotations:
37-
networking.gke.io/load-balancer-type: "Internal"
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
nginxGateway:
2+
lifecycle:
3+
preStop:
4+
exec:
5+
command:
6+
- /usr/bin/gateway
7+
- sleep
8+
- --duration=40s
9+
10+
nginx:
11+
lifecycle:
12+
preStop:
13+
exec:
14+
command:
15+
- /bin/sleep
16+
- "40"
17+
18+
terminationGracePeriodSeconds: 50

0 commit comments

Comments
 (0)