Skip to content

Commit 28fe1f4

Browse files
authored
Address race condition in XController variable (#330)
* Update Dockerfile: * To not to use root user to build * To allow for passing of golang build args Updated Makefile: * To selectively run the generate code * To allow for passing of golang build args * Updates to ensure that docker builds flags are used Update to the e2e script to allow for not caching test runs and cluster setup * Updates to e2e script and travis for log level Fixed 1 race condition issue. * Reverted needless changes. * Documentation updates. Minor fixes to make file. * Fixed bug in docker file Reduced noise in e2e test. * Fixed potential regression. * Added back Dockerfile
1 parent 99bd557 commit 28fe1f4

File tree

12 files changed

+233
-112
lines changed

12 files changed

+233
-112
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,3 +125,6 @@ kubernetes.tar.gz
125125
/bazel-*
126126
*.pyc
127127

128+
# .devcontainer files
129+
.devcontainer
130+

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ install: []
1717

1818

1919
before_script:
20-
- export TEST_LOG_LEVEL=4
20+
- export TEST_LOG_LEVEL=2
2121

2222
script:
2323
- BLUE='\033[34m'

Dockerfile

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
FROM registry.access.redhat.com/ubi8/go-toolset:1.18.10-1 AS BUILDER
2-
USER root
1+
FROM registry.access.redhat.com/ubi8/go-toolset:1.18.10-1 AS BUILDER
2+
ARG GO_BUILD_ARGS
33
WORKDIR /workdir
4+
USER root
45

56
COPY Makefile Makefile
67
COPY go.mod go.mod
@@ -10,8 +11,10 @@ COPY pkg pkg
1011
COPY hack hack
1112
COPY CONTROLLER_VERSION CONTROLLER_VERSION
1213

13-
RUN cd /workdir && curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl && chmod +x kubectl
14-
RUN make mcad-controller
14+
RUN cd /workdir && curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/$(go env GOARCH)/kubectl && chmod +x kubectl
15+
ENV GO_BUILD_ARGS=$GO_BUILD_ARGS
16+
RUN echo "Go build args: $GO_BUILD_ARGS" && \
17+
make mcad-controller
1518

1619
FROM registry.access.redhat.com/ubi8/ubi-minimal:latest
1720

Makefile

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ VERSION_FILE=./CONTROLLER_VERSION
44
RELEASE_VER=v$(shell $(CAT_CMD) $(VERSION_FILE))
55
CURRENT_DIR=$(shell pwd)
66
GIT_BRANCH:=$(shell git symbolic-ref --short HEAD 2>&1 | grep -v fatal)
7-
LOCAL_BUILD_ARGS ?= -race
7+
#define the GO_BUILD_ARGS if you need to pass additional arguments to the go build
8+
GO_BUILD_ARGS?=
9+
810
# Reset branch name if this a Travis CI environment
911
ifneq ($(strip $(TRAVIS_BRANCH)),)
1012
GIT_BRANCH:=${TRAVIS_BRANCH}
@@ -29,8 +31,13 @@ TAG:=${TAG}${RELEASE_VER}
2931

3032
# Build the controler executalbe for use in docker image build
3133
mcad-controller: init generate-code
34+
ifeq ($(strip $(GO_BUILD_ARGS)),)
3235
$(info Compiling controller)
3336
CGO_ENABLED=0 go build -o ${BIN_DIR}/mcad-controller ./cmd/kar-controllers/
37+
else
38+
$(info Compiling controller with build arguments: '${GO_BUILD_ARGS}')
39+
go build $(GO_BUILD_ARGS) -o ${BIN_DIR}/mcad-controller ./cmd/kar-controllers/
40+
endif
3441

3542
print-global-variables:
3643
$(info "---")
@@ -39,6 +46,7 @@ print-global-variables:
3946
$(info " "GIT_BRANCH="$(GIT_BRANCH)")
4047
$(info " "RELEASE_VER="$(RELEASE_VER)")
4148
$(info " "TAG="$(TAG)")
49+
$(info " "GO_BUILD_ARGS="$(GO_BUILD_ARGS)")
4250
$(info "---")
4351

4452
verify: generate-code
@@ -53,30 +61,41 @@ verify-tag-name: print-global-variables
5361
# Check for invalid tag name
5462
t=${TAG} && [ $${#t} -le 128 ] || { echo "Target name $$t has 128 or more chars"; false; }
5563

56-
generate-code:
57-
$(info Compiling deepcopy-gen...)
58-
go build -o ${BIN_DIR}/deepcopy-gen ./cmd/deepcopy-gen/
64+
generate-code: pkg/apis/controller/v1beta1/zz_generated.deepcopy.go
65+
66+
pkg/apis/controller/v1beta1/zz_generated.deepcopy.go: ${BIN_DIR}/deepcopy-gen
5967
$(info Generating deepcopy...)
6068
${BIN_DIR}/deepcopy-gen -i ./pkg/apis/controller/v1beta1/ -O zz_generated.deepcopy
6169

62-
images: verify-tag-name
70+
${BIN_DIR}/deepcopy-gen:
71+
$(info Compiling deepcopy-gen...)
72+
go build -o ${BIN_DIR}/deepcopy-gen ./cmd/deepcopy-gen/
73+
74+
images: verify-tag-name generate-code
6375
$(info List executable directory)
6476
$(info repo id: ${git_repository_id})
6577
$(info branch: ${GIT_BRANCH})
6678
$(info Build the docker image)
79+
ifeq ($(strip $(GO_BUILD_ARGS)),)
6780
docker build --quiet --no-cache --tag mcad-controller:${TAG} -f ${CURRENT_DIR}/Dockerfile ${CURRENT_DIR}
81+
else
82+
docker build --no-cache --tag mcad-controller:${TAG} --build-arg GO_BUILD_ARGS=$(GO_BUILD_ARGS) -f ${CURRENT_DIR}/Dockerfile ${CURRENT_DIR}
83+
endif
6884

69-
images-podman: verify-tag-name
85+
images-podman: verify-tag-name generate-code
7086
$(info List executable directory)
7187
$(info repo id: ${git_repository_id})
7288
$(info branch: ${GIT_BRANCH})
73-
ls -l ${CURRENT_DIR}/_output/bin
7489
$(info Build the docker image)
90+
ifeq ($(strip $(GO_BUILD_ARGS)),)
7591
podman build --quiet --no-cache --tag mcad-controller:${TAG} -f ${CURRENT_DIR}/Dockerfile ${CURRENT_DIR}
92+
else
93+
podman build --no-cache --tag mcad-controller:${TAG} --build-arg GO_BUILD_ARGS=$(GO_BUILD_ARGS) -f ${CURRENT_DIR}/Dockerfile ${CURRENT_DIR}
94+
endif
7695

7796
push-images: verify-tag-name
7897
ifeq ($(strip $(quay_repository)),)
79-
$(info No registry information provide. To push images to a docker registry please set)
98+
$(info No registry information provided. To push images to a docker registry please set)
8099
$(info environment variables: quay_repository, quay_token, and quay_id. Environment)
81100
$(info variables do not need to be set for github Travis CICD.)
82101
else
@@ -106,14 +125,6 @@ else
106125
hack/run-e2e-kind.sh ${quay_repository}/mcad-controller ${TAG}
107126
endif
108127

109-
110-
# Build the controller executable for use on the local host and using local build args
111-
# the default for local build args is `-race` to turn race detection, this is not to be used
112-
# inside the docker containers.
113-
mcad-controller-local: init generate-code
114-
$(info Compiling controller)
115-
go build ${LOCAL_BUILD_ARGS} -o ${BIN_DIR}/mcad-controller-local ./cmd/kar-controllers/
116-
117128
coverage:
118129
# KUBE_COVER=y hack/make-rules/test.sh $(WHAT) $(TESTS)
119130

doc/build/build.md

Lines changed: 31 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
# Multi-Cluster-App-Dispatcher Controller Build Instructions
22

3-
This document will show how to build the `Multi-Cluster-App-Deployer` (`MCAD`) Kubernetes Controller that operates on an `AppWrapper` kubernetes custom resource definition. Instructions are for the [master](https://github.com/IBM/multi-cluster-app-dispatcher/tree/master) branch.
3+
This document will show how to build the `Multi-Cluster-App-Dispatcher` (`MCAD`) Kubernetes Controller that operates on an `AppWrapper` kubernetes custom resource definition. Instructions are for the [main](https://github.com/project-codeflare/multi-cluster-app-dispatcher/tree/main) branch.
44

55
## 1. Pre-condition
66

77
### Docker Environment
88

9-
To build `Multi-Cluster-App-Deployer`, a running Docker environment must be available. Here is a document on [Getting Started with Docker](https://www.docker.com/get-started).
9+
To build `Multi-Cluster-App-Dispatcher`, a running Docker environment must be available. Here is a document on [Getting Started with Docker](https://www.docker.com/get-started). Podman image builds are also supported.
1010

1111
### Clone Multi-Cluster-App-Deployer Git Repo
1212

@@ -36,8 +36,8 @@ $
3636
3737
To build the controller and to run the end to end tests locally you will need to have the following software installed:
3838
39-
* `Go` (version 1.16) -- the controller will compile and run with later versions, but currently supported version is 1.16
40-
* `kind` (version 0.11) -- later versions will work fine
39+
* `Go` (version 1.18)
40+
* `kind` (version 0.18)
4141
* `kubectl`
4242
* `helm` - version 3.0 or later
4343
* `make`
@@ -56,30 +56,25 @@ To to build the executable, execute:
5656
#build for linux OS and for use inside docker image
5757
multi-cluster-app-dispatcher $ make mcad-controller
5858
...
59-
Compiling deepcopy-gen...
60-
Generating deepcopy...
61-
go build -o _output/bin/deepcopy-gen ./cmd/deepcopy-gen/
62-
_output/bin/deepcopy-gen -i ./pkg/apis/controller/v1beta1/ -O zz_generated.deepcopy
63-
Compiling controller
64-
CGO_ENABLED=0 GOOS="linux" go build -o _output/bin/mcad-controller ./cmd/kar-controllers/
65-
66-
#build for local testing purposes, by default enable the race conditions detector
67-
multi-cluster-app-dispatcher $ make mcad-controller-local
68-
...
6959
mkdir -p _output/bin
70-
Compiling deepcopy-gen...
71-
Generating deepcopy...
72-
go build -o _output/bin/deepcopy-gen ./cmd/deepcopy-gen/
73-
_output/bin/deepcopy-gen -i ./pkg/apis/controller/v1beta1/ -O zz_generated.deepcopy
7460
Compiling controller
75-
go build -race -o _output/bin/mcad-controller-local ./cmd/kar-controllers/
61+
CGO_ENABLED=0 go build -o _output/bin/mcad-controller ./cmd/kar-controllers/
7662
```
7763
78-
Ensure the executables: `deepcopy-gen` and `mcad-controllers` are created in the target output directory:
64+
Ensure the executable `mcad-controllers` are created in the target output directory:
7965
8066
```bash
8167
multi-cluster-app-dispatcher $ ls _output/bin
82-
deepcopy-gen mcad-controller
68+
mcad-controller
69+
```
70+
71+
If you want pass additional args to the `go build`, define add them to the `GO_BUILD_ARGS` environment variable. This feature is useful if you want to compile the executable with the race condition detector turned on. To turn on the the race detector in your executable, execute:
72+
73+
```bash
74+
make mcad-controller GO_BUILD_ARGS=-race
75+
mkdir -p _output/bin
76+
Compiling controller with build arguments: '-race'
77+
go build -race -o _output/bin/mcad-controller ./cmd/kar-controllers/
8378
```
8479
8580
### Build the Multi-Cluster-App-Dispatcher Image
@@ -92,22 +87,22 @@ From the root directory of the repository:
9287
# With docker daemon running
9388
multi-cluster-app-dispatcher % make images
9489
....
95-
# output from main branch, MacOS build, local file names replaced with XXXXXXXXXX
90+
make images
9691
"---"
9792
"MAKE GLOBAL VARIABLES:"
9893
" "BIN_DIR="_output/bin"
9994
" "GIT_BRANCH="main"
100-
" "RELEASE_VER="v1.29.55"
101-
" "TAG="main-v1.29.55"
95+
" "RELEASE_VER="v1.29.57"
96+
" "TAG="main-v1.29.57"
97+
" "GO_BUILD_ARGS=""
10298
"---"
10399
# Check for invalid tag name
104-
t=main-v1.29.55 && [ ${#t} -le 128 ] || { echo "Target name $t has 128 or more chars"; false; }
100+
t=main-v1.29.57 && [ ${#t} -le 128 ] || { echo "Target name $t has 128 or more chars"; false; }
105101
List executable directory
106102
repo id:
107103
branch: main
108104
Build the docker image
109-
docker build --quiet --no-cache --tag mcad-controller:main-v1.29.55 -f XXXXXXXXXX/multi-cluster-app-dispatcher/Dockerfile XXXXXXXXXX/multi-cluster-app-dispatcher
110-
sha256:6871c150701280abc29baa14aa639791cefb9ba4b61177ab4faf5a43bdfcc4e4
105+
docker build --quiet --no-cache --tag mcad-controller:main-v1.29.57 -f XXXXXX/multi-cluster-app-dispatcher/Dockerfile XXXXX/multi-cluster-app-dispatcher
111106
112107
#Using podman
113108
make images-podman
@@ -117,37 +112,35 @@ make images-podman
117112
"MAKE GLOBAL VARIABLES:"
118113
" "BIN_DIR="_output/bin"
119114
" "GIT_BRANCH="main"
120-
" "RELEASE_VER="v1.29.55"
121-
" "TAG="main-v1.29.55"
115+
" "RELEASE_VER="v1.29.57"
116+
" "TAG="main-v1.29.57"
117+
" "GO_BUILD_ARGS=""
122118
"---"
123119
# Check for invalid tag name
124-
t=main-v1.29.55 && [ ${#t} -le 128 ] || { echo "Target name $t has 128 or more chars"; false; }
120+
t=main-v1.29.57 && [ ${#t} -le 128 ] || { echo "Target name $t has 128 or more chars"; false; }
125121
List executable directory
126122
repo id:
127123
branch: main
128124
Build the docker image
129-
ls -l XXXXXXXXXX/multi-cluster-app-dispatcher/_output/bin
130-
total 130144
131-
-rwxr-xr-x 1 laurentiu.bradin staff 8238498 Apr 6 15:19 deepcopy-gen
132-
-rwxr-xr-x 1 laurentiu.bradin staff 58391090 Apr 6 15:19 mcad-controller
133-
podman build --quiet --no-cache --tag mcad-controller:issue_315_small_changes-v1.29.55 -f XXXXXXXXXX/multi-cluster-app-dispatcher/Dockerfile XXXXXXXXXX/multi-cluster-app-dispatcher
134-
f784707e8982399ef7ef66e3d8a09b669e6deb17990d174400338813fb13c505
125+
podman build --quiet --no-cache --tag mcad-controller:main-v1.29.57 -f XXXXX/multi-cluster-app-dispatcher/Dockerfile XXXXX/multi-cluster-app-dispatcher
135126
```
136127
128+
The `GO_BUILD_ARGS` use is also supported by the images builds with either `docker` and `podman`. To turn on the race condition detector in image's executable execute: `make images GO_BUILD_ARGS=-race`
129+
137130
### Push the Multi-Cluster-App-Dispatcher Image to an Image Repository
138131
139132
The following example assumes an available `<repository>/mcad-controller` on [Docker Hub](https://hub.docker.com) and using image version `v1.14`
140133
141134
```bash
142135
docker login
143-
docker push <respository>/mcad-controller:v1.14
136+
docker push <repository>/mcad-controller:v1.14
144137
```
145138
146139
The same can be done with [Quay](quay.io)
147140
148141
```bash
149142
docker login quay.io
150-
docker push <quay_respository>/mcad-controller:v1.14
143+
docker push <quay_repository>/mcad-controller:v1.14
151144
```
152145
153146
Refer to [deployment](../deploy/deployment.md) on how to deploy the `multi-cluster-app-dispatcher` as a controller in Kubernetes.

hack/run-e2e-kind.sh

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
# limitations under the License.
2929

3030
export ROOT_DIR="$(dirname "$(dirname "$(readlink -fn "$0")")")"
31-
export LOG_LEVEL=3
31+
export LOG_LEVEL=${TEST_LOG_LEVEL:-2}
3232
export CLEANUP_CLUSTER=${CLEANUP_CLUSTER:-"true"}
3333
export CLUSTER_CONTEXT="--name test"
3434
# Using older image due to older version of kubernetes cluster"
@@ -334,7 +334,7 @@ function kube-test-env-up {
334334
# start mcad controller
335335
echo "Starting MCAD Controller..."
336336
echo "helm install mcad-controller namespace kube-system wait set loglevel=2 set resources.requests.cpu=1000m set resources.requests.memory=1024Mi set resources.limits.cpu=4000m set resources.limits.memory=4096Mi set image.repository=$IMAGE_REPOSITORY_MCAD set image.tag=$IMAGE_TAG_MCAD set image.pullPolicy=$MCAD_IMAGE_PULL_POLICY"
337-
helm upgrade --install mcad-controller ${ROOT_DIR}/deployment/mcad-controller --namespace kube-system --wait --set loglevel=2 --set resources.requests.cpu=1000m --set resources.requests.memory=1024Mi --set resources.limits.cpu=4000m --set resources.limits.memory=4096Mi --set configMap.name=mcad-controller-configmap --set configMap.podCreationTimeout='"120000"' --set configMap.quotaEnabled='"false"' --set coscheduler.rbac.apiGroup=scheduling.sigs.k8s.io --set coscheduler.rbac.resource=podgroups --set image.repository=$IMAGE_REPOSITORY_MCAD --set image.tag=$IMAGE_TAG_MCAD --set image.pullPolicy=$MCAD_IMAGE_PULL_POLICY
337+
helm upgrade --install mcad-controller ${ROOT_DIR}/deployment/mcad-controller --namespace kube-system --wait --set loglevel=${LOG_LEVEL} --set resources.requests.cpu=1000m --set resources.requests.memory=1024Mi --set resources.limits.cpu=4000m --set resources.limits.memory=4096Mi --set configMap.name=mcad-controller-configmap --set configMap.podCreationTimeout='"120000"' --set configMap.quotaEnabled='"false"' --set coscheduler.rbac.apiGroup=scheduling.sigs.k8s.io --set coscheduler.rbac.resource=podgroups --set image.repository=$IMAGE_REPOSITORY_MCAD --set image.tag=$IMAGE_TAG_MCAD --set image.pullPolicy=$MCAD_IMAGE_PULL_POLICY
338338

339339
sleep 10
340340
echo "Listing MCAD Controller Helm Chart and Pod YAML..."
@@ -362,6 +362,11 @@ function kube-test-env-up {
362362
done
363363
echo "kubectl uncordon test-worker"
364364
kubectl uncordon test-worker
365+
echo "Waiting for pod in the kube-system namespace to become ready"
366+
while [[ $(kubectl get pods -n kube-system -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' | tr ' ' '\n' | sort -u) != "True" ]]
367+
do
368+
echo -n "." && sleep 1;
369+
done
365370

366371
# Show available resources of cluster nodes
367372
echo "---"
@@ -385,4 +390,4 @@ kind-up-cluster
385390
kube-test-env-up
386391

387392
echo "==========================>>>>> Running E2E tests... <<<<<=========================="
388-
go test ./test/e2e -v -timeout 75m
393+
go test ./test/e2e -v -timeout 75m -count=1
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package queuejob
2+
3+
import (
4+
"strings"
5+
"sync"
6+
7+
arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"
8+
)
9+
10+
// ActiveAppWrapper is current scheduling AppWrapper in the XController struct.
11+
// Its sole purpose is provide a thread safe way to for use the XController logic
12+
type ActiveAppWrapper struct {
13+
activeAW *arbv1.AppWrapper
14+
activeAWMutex *sync.RWMutex
15+
}
16+
17+
// NewActiveAppWrapper
18+
func NewActiveAppWrapper() *ActiveAppWrapper {
19+
return &ActiveAppWrapper{
20+
activeAW: nil,
21+
activeAWMutex: &sync.RWMutex{},
22+
}
23+
}
24+
25+
// AtomicSet as is name implies, atomically sets the activeAW to the new value
26+
func (aw *ActiveAppWrapper) AtomicSet(newValue *arbv1.AppWrapper) {
27+
aw.activeAWMutex.Lock()
28+
defer aw.activeAWMutex.Unlock()
29+
aw.activeAW = newValue
30+
}
31+
32+
// IsActiveAppWrapper safely performs the comparison that was done inside the if block
33+
// at line 1977 in the queuejob_controller_ex.go
34+
// The code looked like this:
35+
//
36+
// if !qj.Status.CanRun && qj.Status.State == arbv1.AppWrapperStateEnqueued &&
37+
// !cc.qjqueue.IfExistUnschedulableQ(qj) && !cc.qjqueue.IfExistActiveQ(qj) {
38+
// // One more check to ensure AW is not the current active schedule object
39+
// if cc.schedulingAW == nil ||
40+
// (strings.Compare(cc.schedulingAW.Namespace, qj.Namespace) != 0 &&
41+
// strings.Compare(cc.schedulingAW.Name, qj.Name) != 0) {
42+
// cc.qjqueue.AddIfNotPresent(qj)
43+
// klog.V(3).Infof("[manageQueueJob] Recovered AppWrapper %s%s - added to active queue, Status=%+v",
44+
// qj.Namespace, qj.Name, qj.Status)
45+
// return nil
46+
// }
47+
// }
48+
func (aw *ActiveAppWrapper) IsActiveAppWrapper(name, namespace string) bool {
49+
aw.activeAWMutex.RLock()
50+
defer aw.activeAWMutex.RUnlock()
51+
return aw.activeAW == nil ||
52+
(strings.Compare(aw.activeAW.Namespace, namespace) != 0 &&
53+
strings.Compare(aw.activeAW.Name, name) != 0)
54+
}

0 commit comments

Comments
 (0)