Skip to content

Address race condition in XController variable #330

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,6 @@ kubernetes.tar.gz
/bazel-*
*.pyc

# .devcontainer files
.devcontainer

2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ install: []


before_script:
- export TEST_LOG_LEVEL=4
- export TEST_LOG_LEVEL=2

script:
- BLUE='\033[34m'
Expand Down
11 changes: 7 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
FROM registry.access.redhat.com/ubi8/go-toolset:1.18.10-1 AS BUILDER
USER root
FROM registry.access.redhat.com/ubi8/go-toolset:1.18.10-1 AS BUILDER
ARG GO_BUILD_ARGS
WORKDIR /workdir
USER root

COPY Makefile Makefile
COPY go.mod go.mod
Expand All @@ -10,8 +11,10 @@ COPY pkg pkg
COPY hack hack
COPY CONTROLLER_VERSION CONTROLLER_VERSION

RUN cd /workdir && curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl && chmod +x kubectl
RUN make mcad-controller
RUN cd /workdir && curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/$(go env GOARCH)/kubectl && chmod +x kubectl
ENV GO_BUILD_ARGS=$GO_BUILD_ARGS
RUN echo "Go build args: $GO_BUILD_ARGS" && \
make mcad-controller

FROM registry.access.redhat.com/ubi8/ubi-minimal:latest

Expand Down
43 changes: 27 additions & 16 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ VERSION_FILE=./CONTROLLER_VERSION
RELEASE_VER=v$(shell $(CAT_CMD) $(VERSION_FILE))
CURRENT_DIR=$(shell pwd)
GIT_BRANCH:=$(shell git symbolic-ref --short HEAD 2>&1 | grep -v fatal)
LOCAL_BUILD_ARGS ?= -race
#define the GO_BUILD_ARGS if you need to pass additional arguments to the go build
GO_BUILD_ARGS?=

# Reset branch name if this a Travis CI environment
ifneq ($(strip $(TRAVIS_BRANCH)),)
GIT_BRANCH:=${TRAVIS_BRANCH}
Expand All @@ -29,8 +31,13 @@ TAG:=${TAG}${RELEASE_VER}

# Build the controler executalbe for use in docker image build
mcad-controller: init generate-code
ifeq ($(strip $(GO_BUILD_ARGS)),)
$(info Compiling controller)
CGO_ENABLED=0 go build -o ${BIN_DIR}/mcad-controller ./cmd/kar-controllers/
else
$(info Compiling controller with build arguments: '${GO_BUILD_ARGS}')
go build $(GO_BUILD_ARGS) -o ${BIN_DIR}/mcad-controller ./cmd/kar-controllers/
endif

print-global-variables:
$(info "---")
Expand All @@ -39,6 +46,7 @@ print-global-variables:
$(info " "GIT_BRANCH="$(GIT_BRANCH)")
$(info " "RELEASE_VER="$(RELEASE_VER)")
$(info " "TAG="$(TAG)")
$(info " "GO_BUILD_ARGS="$(GO_BUILD_ARGS)")
$(info "---")

verify: generate-code
Expand All @@ -53,30 +61,41 @@ verify-tag-name: print-global-variables
# Check for invalid tag name
t=${TAG} && [ $${#t} -le 128 ] || { echo "Target name $$t has 128 or more chars"; false; }

generate-code:
$(info Compiling deepcopy-gen...)
go build -o ${BIN_DIR}/deepcopy-gen ./cmd/deepcopy-gen/
generate-code: pkg/apis/controller/v1beta1/zz_generated.deepcopy.go

pkg/apis/controller/v1beta1/zz_generated.deepcopy.go: ${BIN_DIR}/deepcopy-gen
$(info Generating deepcopy...)
${BIN_DIR}/deepcopy-gen -i ./pkg/apis/controller/v1beta1/ -O zz_generated.deepcopy

images: verify-tag-name
${BIN_DIR}/deepcopy-gen:
$(info Compiling deepcopy-gen...)
go build -o ${BIN_DIR}/deepcopy-gen ./cmd/deepcopy-gen/

images: verify-tag-name generate-code
$(info List executable directory)
$(info repo id: ${git_repository_id})
$(info branch: ${GIT_BRANCH})
$(info Build the docker image)
ifeq ($(strip $(GO_BUILD_ARGS)),)
docker build --quiet --no-cache --tag mcad-controller:${TAG} -f ${CURRENT_DIR}/Dockerfile ${CURRENT_DIR}
else
docker build --no-cache --tag mcad-controller:${TAG} --build-arg GO_BUILD_ARGS=$(GO_BUILD_ARGS) -f ${CURRENT_DIR}/Dockerfile ${CURRENT_DIR}
endif

images-podman: verify-tag-name
images-podman: verify-tag-name generate-code
$(info List executable directory)
$(info repo id: ${git_repository_id})
$(info branch: ${GIT_BRANCH})
ls -l ${CURRENT_DIR}/_output/bin
$(info Build the docker image)
ifeq ($(strip $(GO_BUILD_ARGS)),)
podman build --quiet --no-cache --tag mcad-controller:${TAG} -f ${CURRENT_DIR}/Dockerfile ${CURRENT_DIR}
else
podman build --no-cache --tag mcad-controller:${TAG} --build-arg GO_BUILD_ARGS=$(GO_BUILD_ARGS) -f ${CURRENT_DIR}/Dockerfile ${CURRENT_DIR}
endif

push-images: verify-tag-name
ifeq ($(strip $(quay_repository)),)
$(info No registry information provide. To push images to a docker registry please set)
$(info No registry information provided. To push images to a docker registry please set)
$(info environment variables: quay_repository, quay_token, and quay_id. Environment)
$(info variables do not need to be set for github Travis CICD.)
else
Expand Down Expand Up @@ -106,14 +125,6 @@ else
hack/run-e2e-kind.sh ${quay_repository}/mcad-controller ${TAG}
endif


# Build the controller executable for use on the local host and using local build args
# the default for local build args is `-race` to turn race detection, this is not to be used
# inside the docker containers.
mcad-controller-local: init generate-code
$(info Compiling controller)
go build ${LOCAL_BUILD_ARGS} -o ${BIN_DIR}/mcad-controller-local ./cmd/kar-controllers/

coverage:
# KUBE_COVER=y hack/make-rules/test.sh $(WHAT) $(TESTS)

Expand Down
69 changes: 31 additions & 38 deletions doc/build/build.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Multi-Cluster-App-Dispatcher Controller Build Instructions

This document will show how to build the `Multi-Cluster-App-Deployer` (`MCAD`) Kubernetes Controller that operates on an `AppWrapper` kubernetes custom resource definition. Instructions are for the [master](https://github.com/IBM/multi-cluster-app-dispatcher/tree/master) branch.
This document will show how to build the `Multi-Cluster-App-Dispatcher` (`MCAD`) Kubernetes Controller that operates on an `AppWrapper` kubernetes custom resource definition. Instructions are for the [main](https://github.com/project-codeflare/multi-cluster-app-dispatcher/tree/main) branch.

## 1. Pre-condition

### Docker Environment

To build `Multi-Cluster-App-Deployer`, a running Docker environment must be available. Here is a document on [Getting Started with Docker](https://www.docker.com/get-started).
To build `Multi-Cluster-App-Dispatcher`, a running Docker environment must be available. Here is a document on [Getting Started with Docker](https://www.docker.com/get-started). Podman image builds are also supported.

### Clone Multi-Cluster-App-Deployer Git Repo

Expand Down Expand Up @@ -36,8 +36,8 @@ $

To build the controller and to run the end to end tests locally you will need to have the following software installed:

* `Go` (version 1.16) -- the controller will compile and run with later versions, but currently supported version is 1.16
* `kind` (version 0.11) -- later versions will work fine
* `Go` (version 1.18)
* `kind` (version 0.18)
* `kubectl`
* `helm` - version 3.0 or later
* `make`
Expand All @@ -56,30 +56,25 @@ To to build the executable, execute:
#build for linux OS and for use inside docker image
multi-cluster-app-dispatcher $ make mcad-controller
...
Compiling deepcopy-gen...
Generating deepcopy...
go build -o _output/bin/deepcopy-gen ./cmd/deepcopy-gen/
_output/bin/deepcopy-gen -i ./pkg/apis/controller/v1beta1/ -O zz_generated.deepcopy
Compiling controller
CGO_ENABLED=0 GOOS="linux" go build -o _output/bin/mcad-controller ./cmd/kar-controllers/

#build for local testing purposes, by default enable the race conditions detector
multi-cluster-app-dispatcher $ make mcad-controller-local
...
mkdir -p _output/bin
Compiling deepcopy-gen...
Generating deepcopy...
go build -o _output/bin/deepcopy-gen ./cmd/deepcopy-gen/
_output/bin/deepcopy-gen -i ./pkg/apis/controller/v1beta1/ -O zz_generated.deepcopy
Compiling controller
go build -race -o _output/bin/mcad-controller-local ./cmd/kar-controllers/
CGO_ENABLED=0 go build -o _output/bin/mcad-controller ./cmd/kar-controllers/
```

Ensure the executables: `deepcopy-gen` and `mcad-controllers` are created in the target output directory:
Ensure the executable `mcad-controllers` are created in the target output directory:

```bash
multi-cluster-app-dispatcher $ ls _output/bin
deepcopy-gen mcad-controller
mcad-controller
```

If you want pass additional args to the `go build`, define add them to the `GO_BUILD_ARGS` environment variable. This feature is useful if you want to compile the executable with the race condition detector turned on. To turn on the the race detector in your executable, execute:

```bash
make mcad-controller GO_BUILD_ARGS=-race
mkdir -p _output/bin
Compiling controller with build arguments: '-race'
go build -race -o _output/bin/mcad-controller ./cmd/kar-controllers/
```

### Build the Multi-Cluster-App-Dispatcher Image
Expand All @@ -92,22 +87,22 @@ From the root directory of the repository:
# With docker daemon running
multi-cluster-app-dispatcher % make images
....
# output from main branch, MacOS build, local file names replaced with XXXXXXXXXX
make images
"---"
"MAKE GLOBAL VARIABLES:"
" "BIN_DIR="_output/bin"
" "GIT_BRANCH="main"
" "RELEASE_VER="v1.29.55"
" "TAG="main-v1.29.55"
" "RELEASE_VER="v1.29.57"
" "TAG="main-v1.29.57"
" "GO_BUILD_ARGS=""
"---"
# Check for invalid tag name
t=main-v1.29.55 && [ ${#t} -le 128 ] || { echo "Target name $t has 128 or more chars"; false; }
t=main-v1.29.57 && [ ${#t} -le 128 ] || { echo "Target name $t has 128 or more chars"; false; }
List executable directory
repo id:
branch: main
Build the docker image
docker build --quiet --no-cache --tag mcad-controller:main-v1.29.55 -f XXXXXXXXXX/multi-cluster-app-dispatcher/Dockerfile XXXXXXXXXX/multi-cluster-app-dispatcher
sha256:6871c150701280abc29baa14aa639791cefb9ba4b61177ab4faf5a43bdfcc4e4
docker build --quiet --no-cache --tag mcad-controller:main-v1.29.57 -f XXXXXX/multi-cluster-app-dispatcher/Dockerfile XXXXX/multi-cluster-app-dispatcher

#Using podman
make images-podman
Expand All @@ -117,37 +112,35 @@ make images-podman
"MAKE GLOBAL VARIABLES:"
" "BIN_DIR="_output/bin"
" "GIT_BRANCH="main"
" "RELEASE_VER="v1.29.55"
" "TAG="main-v1.29.55"
" "RELEASE_VER="v1.29.57"
" "TAG="main-v1.29.57"
" "GO_BUILD_ARGS=""
"---"
# Check for invalid tag name
t=main-v1.29.55 && [ ${#t} -le 128 ] || { echo "Target name $t has 128 or more chars"; false; }
t=main-v1.29.57 && [ ${#t} -le 128 ] || { echo "Target name $t has 128 or more chars"; false; }
List executable directory
repo id:
branch: main
Build the docker image
ls -l XXXXXXXXXX/multi-cluster-app-dispatcher/_output/bin
total 130144
-rwxr-xr-x 1 laurentiu.bradin staff 8238498 Apr 6 15:19 deepcopy-gen
-rwxr-xr-x 1 laurentiu.bradin staff 58391090 Apr 6 15:19 mcad-controller
podman build --quiet --no-cache --tag mcad-controller:issue_315_small_changes-v1.29.55 -f XXXXXXXXXX/multi-cluster-app-dispatcher/Dockerfile XXXXXXXXXX/multi-cluster-app-dispatcher
f784707e8982399ef7ef66e3d8a09b669e6deb17990d174400338813fb13c505
podman build --quiet --no-cache --tag mcad-controller:main-v1.29.57 -f XXXXX/multi-cluster-app-dispatcher/Dockerfile XXXXX/multi-cluster-app-dispatcher
```

The `GO_BUILD_ARGS` use is also supported by the images builds with either `docker` and `podman`. To turn on the race condition detector in image's executable execute: `make images GO_BUILD_ARGS=-race`

### Push the Multi-Cluster-App-Dispatcher Image to an Image Repository

The following example assumes an available `<repository>/mcad-controller` on [Docker Hub](https://hub.docker.com) and using image version `v1.14`

```bash
docker login
docker push <respository>/mcad-controller:v1.14
docker push <repository>/mcad-controller:v1.14
```

The same can be done with [Quay](quay.io)

```bash
docker login quay.io
docker push <quay_respository>/mcad-controller:v1.14
docker push <quay_repository>/mcad-controller:v1.14
```

Refer to [deployment](../deploy/deployment.md) on how to deploy the `multi-cluster-app-dispatcher` as a controller in Kubernetes.
Expand Down
11 changes: 8 additions & 3 deletions hack/run-e2e-kind.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
# limitations under the License.

export ROOT_DIR="$(dirname "$(dirname "$(readlink -fn "$0")")")"
export LOG_LEVEL=3
export LOG_LEVEL=${TEST_LOG_LEVEL:-2}
export CLEANUP_CLUSTER=${CLEANUP_CLUSTER:-"true"}
export CLUSTER_CONTEXT="--name test"
# Using older image due to older version of kubernetes cluster"
Expand Down Expand Up @@ -334,7 +334,7 @@ function kube-test-env-up {
# start mcad controller
echo "Starting MCAD Controller..."
echo "helm install mcad-controller namespace kube-system wait set loglevel=2 set resources.requests.cpu=1000m set resources.requests.memory=1024Mi set resources.limits.cpu=4000m set resources.limits.memory=4096Mi set image.repository=$IMAGE_REPOSITORY_MCAD set image.tag=$IMAGE_TAG_MCAD set image.pullPolicy=$MCAD_IMAGE_PULL_POLICY"
helm upgrade --install mcad-controller ${ROOT_DIR}/deployment/mcad-controller --namespace kube-system --wait --set loglevel=2 --set resources.requests.cpu=1000m --set resources.requests.memory=1024Mi --set resources.limits.cpu=4000m --set resources.limits.memory=4096Mi --set configMap.name=mcad-controller-configmap --set configMap.podCreationTimeout='"120000"' --set configMap.quotaEnabled='"false"' --set coscheduler.rbac.apiGroup=scheduling.sigs.k8s.io --set coscheduler.rbac.resource=podgroups --set image.repository=$IMAGE_REPOSITORY_MCAD --set image.tag=$IMAGE_TAG_MCAD --set image.pullPolicy=$MCAD_IMAGE_PULL_POLICY
helm upgrade --install mcad-controller ${ROOT_DIR}/deployment/mcad-controller --namespace kube-system --wait --set loglevel=${LOG_LEVEL} --set resources.requests.cpu=1000m --set resources.requests.memory=1024Mi --set resources.limits.cpu=4000m --set resources.limits.memory=4096Mi --set configMap.name=mcad-controller-configmap --set configMap.podCreationTimeout='"120000"' --set configMap.quotaEnabled='"false"' --set coscheduler.rbac.apiGroup=scheduling.sigs.k8s.io --set coscheduler.rbac.resource=podgroups --set image.repository=$IMAGE_REPOSITORY_MCAD --set image.tag=$IMAGE_TAG_MCAD --set image.pullPolicy=$MCAD_IMAGE_PULL_POLICY

sleep 10
echo "Listing MCAD Controller Helm Chart and Pod YAML..."
Expand Down Expand Up @@ -362,6 +362,11 @@ function kube-test-env-up {
done
echo "kubectl uncordon test-worker"
kubectl uncordon test-worker
echo "Waiting for pod in the kube-system namespace to become ready"
while [[ $(kubectl get pods -n kube-system -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' | tr ' ' '\n' | sort -u) != "True" ]]
do
echo -n "." && sleep 1;
done

# Show available resources of cluster nodes
echo "---"
Expand All @@ -385,4 +390,4 @@ kind-up-cluster
kube-test-env-up

echo "==========================>>>>> Running E2E tests... <<<<<=========================="
go test ./test/e2e -v -timeout 75m
go test ./test/e2e -v -timeout 75m -count=1
54 changes: 54 additions & 0 deletions pkg/controller/queuejob/active_appwrapper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package queuejob

import (
"strings"
"sync"

arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"
)

// ActiveAppWrapper is current scheduling AppWrapper in the XController struct.
// Its sole purpose is provide a thread safe way to for use the XController logic
type ActiveAppWrapper struct {
activeAW *arbv1.AppWrapper
activeAWMutex *sync.RWMutex
}

// NewActiveAppWrapper
func NewActiveAppWrapper() *ActiveAppWrapper {
return &ActiveAppWrapper{
activeAW: nil,
activeAWMutex: &sync.RWMutex{},
}
}

// AtomicSet as is name implies, atomically sets the activeAW to the new value
func (aw *ActiveAppWrapper) AtomicSet(newValue *arbv1.AppWrapper) {
aw.activeAWMutex.Lock()
defer aw.activeAWMutex.Unlock()
aw.activeAW = newValue
}

// IsActiveAppWrapper safely performs the comparison that was done inside the if block
// at line 1977 in the queuejob_controller_ex.go
// The code looked like this:
//
// if !qj.Status.CanRun && qj.Status.State == arbv1.AppWrapperStateEnqueued &&
// !cc.qjqueue.IfExistUnschedulableQ(qj) && !cc.qjqueue.IfExistActiveQ(qj) {
// // One more check to ensure AW is not the current active schedule object
// if cc.schedulingAW == nil ||
// (strings.Compare(cc.schedulingAW.Namespace, qj.Namespace) != 0 &&
// strings.Compare(cc.schedulingAW.Name, qj.Name) != 0) {
// cc.qjqueue.AddIfNotPresent(qj)
// klog.V(3).Infof("[manageQueueJob] Recovered AppWrapper %s%s - added to active queue, Status=%+v",
// qj.Namespace, qj.Name, qj.Status)
// return nil
// }
// }
func (aw *ActiveAppWrapper) IsActiveAppWrapper(name, namespace string) bool {
aw.activeAWMutex.RLock()
defer aw.activeAWMutex.RUnlock()
return aw.activeAW == nil ||
(strings.Compare(aw.activeAW.Namespace, namespace) != 0 &&
strings.Compare(aw.activeAW.Name, name) != 0)
}
Loading