diff --git a/.gitignore b/.gitignore index 67cd6f938..2b87a7874 100644 --- a/.gitignore +++ b/.gitignore @@ -125,3 +125,6 @@ kubernetes.tar.gz /bazel-* *.pyc +# .devcontainer files +.devcontainer + diff --git a/.travis.yml b/.travis.yml index c796a7d34..6928d77e3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,7 @@ install: [] before_script: - - export TEST_LOG_LEVEL=4 + - export TEST_LOG_LEVEL=2 script: - BLUE='\033[34m' diff --git a/Dockerfile b/Dockerfile index 312e360bd..47acff9e3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ -FROM registry.access.redhat.com/ubi8/go-toolset:1.18.10-1 AS BUILDER -USER root +FROM registry.access.redhat.com/ubi8/go-toolset:1.18.10-1 AS BUILDER +ARG GO_BUILD_ARGS WORKDIR /workdir +USER root COPY Makefile Makefile COPY go.mod go.mod @@ -10,8 +11,10 @@ COPY pkg pkg COPY hack hack COPY CONTROLLER_VERSION CONTROLLER_VERSION -RUN cd /workdir && curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl && chmod +x kubectl -RUN make mcad-controller +RUN cd /workdir && curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/$(go env GOARCH)/kubectl && chmod +x kubectl +ENV GO_BUILD_ARGS=$GO_BUILD_ARGS +RUN echo "Go build args: $GO_BUILD_ARGS" && \ + make mcad-controller FROM registry.access.redhat.com/ubi8/ubi-minimal:latest diff --git a/Makefile b/Makefile index 10ffd8fb0..4d2c2ded7 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,9 @@ VERSION_FILE=./CONTROLLER_VERSION RELEASE_VER=v$(shell $(CAT_CMD) $(VERSION_FILE)) CURRENT_DIR=$(shell pwd) GIT_BRANCH:=$(shell git symbolic-ref --short HEAD 2>&1 | grep -v fatal) -LOCAL_BUILD_ARGS ?= -race +#define the GO_BUILD_ARGS if you need to pass additional arguments to the go build +GO_BUILD_ARGS?= + # Reset branch name if this a Travis CI environment ifneq ($(strip $(TRAVIS_BRANCH)),) GIT_BRANCH:=${TRAVIS_BRANCH} @@ -29,8 +31,13 @@ TAG:=${TAG}${RELEASE_VER} # Build the controler executalbe for use in docker image build mcad-controller: init generate-code +ifeq ($(strip $(GO_BUILD_ARGS)),) $(info Compiling controller) CGO_ENABLED=0 go build -o ${BIN_DIR}/mcad-controller ./cmd/kar-controllers/ +else + $(info Compiling controller with build arguments: '${GO_BUILD_ARGS}') + go build $(GO_BUILD_ARGS) -o ${BIN_DIR}/mcad-controller ./cmd/kar-controllers/ +endif print-global-variables: $(info "---") @@ -39,6 +46,7 @@ print-global-variables: $(info " "GIT_BRANCH="$(GIT_BRANCH)") $(info " "RELEASE_VER="$(RELEASE_VER)") $(info " "TAG="$(TAG)") + $(info " "GO_BUILD_ARGS="$(GO_BUILD_ARGS)") $(info "---") verify: generate-code @@ -53,30 +61,41 @@ verify-tag-name: print-global-variables # Check for invalid tag name t=${TAG} && [ $${#t} -le 128 ] || { echo "Target name $$t has 128 or more chars"; false; } -generate-code: - $(info Compiling deepcopy-gen...) - go build -o ${BIN_DIR}/deepcopy-gen ./cmd/deepcopy-gen/ +generate-code: pkg/apis/controller/v1beta1/zz_generated.deepcopy.go + +pkg/apis/controller/v1beta1/zz_generated.deepcopy.go: ${BIN_DIR}/deepcopy-gen $(info Generating deepcopy...) ${BIN_DIR}/deepcopy-gen -i ./pkg/apis/controller/v1beta1/ -O zz_generated.deepcopy -images: verify-tag-name +${BIN_DIR}/deepcopy-gen: + $(info Compiling deepcopy-gen...) 
+ go build -o ${BIN_DIR}/deepcopy-gen ./cmd/deepcopy-gen/ + +images: verify-tag-name generate-code $(info List executable directory) $(info repo id: ${git_repository_id}) $(info branch: ${GIT_BRANCH}) $(info Build the docker image) +ifeq ($(strip $(GO_BUILD_ARGS)),) docker build --quiet --no-cache --tag mcad-controller:${TAG} -f ${CURRENT_DIR}/Dockerfile ${CURRENT_DIR} +else + docker build --no-cache --tag mcad-controller:${TAG} --build-arg GO_BUILD_ARGS=$(GO_BUILD_ARGS) -f ${CURRENT_DIR}/Dockerfile ${CURRENT_DIR} +endif -images-podman: verify-tag-name +images-podman: verify-tag-name generate-code $(info List executable directory) $(info repo id: ${git_repository_id}) $(info branch: ${GIT_BRANCH}) - ls -l ${CURRENT_DIR}/_output/bin $(info Build the docker image) +ifeq ($(strip $(GO_BUILD_ARGS)),) podman build --quiet --no-cache --tag mcad-controller:${TAG} -f ${CURRENT_DIR}/Dockerfile ${CURRENT_DIR} +else + podman build --no-cache --tag mcad-controller:${TAG} --build-arg GO_BUILD_ARGS=$(GO_BUILD_ARGS) -f ${CURRENT_DIR}/Dockerfile ${CURRENT_DIR} +endif push-images: verify-tag-name ifeq ($(strip $(quay_repository)),) - $(info No registry information provide. To push images to a docker registry please set) + $(info No registry information provided. To push images to a docker registry please set) $(info environment variables: quay_repository, quay_token, and quay_id. Environment) $(info variables do not need to be set for github Travis CICD.) else @@ -106,14 +125,6 @@ else hack/run-e2e-kind.sh ${quay_repository}/mcad-controller ${TAG} endif - -# Build the controller executable for use on the local host and using local build args -# the default for local build args is `-race` to turn race detection, this is not to be used -# inside the docker containers. -mcad-controller-local: init generate-code - $(info Compiling controller) - go build ${LOCAL_BUILD_ARGS} -o ${BIN_DIR}/mcad-controller-local ./cmd/kar-controllers/ - coverage: # KUBE_COVER=y hack/make-rules/test.sh $(WHAT) $(TESTS) diff --git a/doc/build/build.md b/doc/build/build.md index d759a92d5..75629fe34 100644 --- a/doc/build/build.md +++ b/doc/build/build.md @@ -1,12 +1,12 @@ # Multi-Cluster-App-Dispatcher Controller Build Instructions -This document will show how to build the `Multi-Cluster-App-Deployer` (`MCAD`) Kubernetes Controller that operates on an `AppWrapper` kubernetes custom resource definition. Instructions are for the [master](https://github.com/IBM/multi-cluster-app-dispatcher/tree/master) branch. +This document will show how to build the `Multi-Cluster-App-Dispatcher` (`MCAD`) Kubernetes Controller that operates on an `AppWrapper` kubernetes custom resource definition. Instructions are for the [main](https://github.com/project-codeflare/multi-cluster-app-dispatcher/tree/main) branch. ## 1. Pre-condition ### Docker Environment -To build `Multi-Cluster-App-Deployer`, a running Docker environment must be available. Here is a document on [Getting Started with Docker](https://www.docker.com/get-started). +To build `Multi-Cluster-App-Dispatcher`, a running Docker environment must be available. Here is a document on [Getting Started with Docker](https://www.docker.com/get-started). Podman image builds are also supported. 
### Clone Multi-Cluster-App-Deployer Git Repo @@ -36,8 +36,8 @@ $ To build the controller and to run the end to end tests locally you will need to have the following software installed: -* `Go` (version 1.16) -- the controller will compile and run with later versions, but currently supported version is 1.16 -* `kind` (version 0.11) -- later versions will work fine +* `Go` (version 1.18) +* `kind` (version 0.18) * `kubectl` * `helm` - version 3.0 or later * `make` @@ -56,30 +56,25 @@ To to build the executable, execute: #build for linux OS and for use inside docker image multi-cluster-app-dispatcher $ make mcad-controller ... -Compiling deepcopy-gen... -Generating deepcopy... -go build -o _output/bin/deepcopy-gen ./cmd/deepcopy-gen/ -_output/bin/deepcopy-gen -i ./pkg/apis/controller/v1beta1/ -O zz_generated.deepcopy -Compiling controller -CGO_ENABLED=0 GOOS="linux" go build -o _output/bin/mcad-controller ./cmd/kar-controllers/ - -#build for local testing purposes, by default enable the race conditions detector -multi-cluster-app-dispatcher $ make mcad-controller-local -... mkdir -p _output/bin -Compiling deepcopy-gen... -Generating deepcopy... -go build -o _output/bin/deepcopy-gen ./cmd/deepcopy-gen/ -_output/bin/deepcopy-gen -i ./pkg/apis/controller/v1beta1/ -O zz_generated.deepcopy Compiling controller -go build -race -o _output/bin/mcad-controller-local ./cmd/kar-controllers/ +CGO_ENABLED=0 go build -o _output/bin/mcad-controller ./cmd/kar-controllers/ ``` -Ensure the executables: `deepcopy-gen` and `mcad-controllers` are created in the target output directory: +Ensure the executable `mcad-controller` is created in the target output directory: ```bash multi-cluster-app-dispatcher $ ls _output/bin -deepcopy-gen mcad-controller +mcad-controller ``` + +If you want to pass additional arguments to `go build`, add them to the `GO_BUILD_ARGS` environment variable. This feature is useful if you want to compile the executable with the race detector turned on. To turn on the race detector in your executable, execute: + +```bash +make mcad-controller GO_BUILD_ARGS=-race +mkdir -p _output/bin +Compiling controller with build arguments: '-race' +go build -race -o _output/bin/mcad-controller ./cmd/kar-controllers/ ``` ### Build the Multi-Cluster-App-Dispatcher Image @@ -92,22 +87,22 @@ From the root directory of the repository: # With docker daemon running multi-cluster-app-dispatcher % make images ....
-# output from main branch, MacOS build, local file names replaced with XXXXXXXXXX +make images "---" "MAKE GLOBAL VARIABLES:" " "BIN_DIR="_output/bin" " "GIT_BRANCH="main" -" "RELEASE_VER="v1.29.55" -" "TAG="main-v1.29.55" +" "RELEASE_VER="v1.29.57" +" "TAG="main-v1.29.57" +" "GO_BUILD_ARGS="" "---" # Check for invalid tag name -t=main-v1.29.55 && [ ${#t} -le 128 ] || { echo "Target name $t has 128 or more chars"; false; } +t=main-v1.29.57 && [ ${#t} -le 128 ] || { echo "Target name $t has 128 or more chars"; false; } List executable directory repo id: branch: main Build the docker image -docker build --quiet --no-cache --tag mcad-controller:main-v1.29.55 -f XXXXXXXXXX/multi-cluster-app-dispatcher/Dockerfile XXXXXXXXXX/multi-cluster-app-dispatcher -sha256:6871c150701280abc29baa14aa639791cefb9ba4b61177ab4faf5a43bdfcc4e4 +docker build --quiet --no-cache --tag mcad-controller:main-v1.29.57 -f XXXXXX/multi-cluster-app-dispatcher/Dockerfile XXXXX/multi-cluster-app-dispatcher #Using podman make images-podman @@ -117,37 +112,35 @@ make images-podman "MAKE GLOBAL VARIABLES:" " "BIN_DIR="_output/bin" " "GIT_BRANCH="main" -" "RELEASE_VER="v1.29.55" -" "TAG="main-v1.29.55" +" "RELEASE_VER="v1.29.57" +" "TAG="main-v1.29.57" +" "GO_BUILD_ARGS="" "---" # Check for invalid tag name -t=main-v1.29.55 && [ ${#t} -le 128 ] || { echo "Target name $t has 128 or more chars"; false; } +t=main-v1.29.57 && [ ${#t} -le 128 ] || { echo "Target name $t has 128 or more chars"; false; } List executable directory repo id: branch: main Build the docker image -ls -l XXXXXXXXXX/multi-cluster-app-dispatcher/_output/bin -total 130144 --rwxr-xr-x 1 laurentiu.bradin staff 8238498 Apr 6 15:19 deepcopy-gen --rwxr-xr-x 1 laurentiu.bradin staff 58391090 Apr 6 15:19 mcad-controller -podman build --quiet --no-cache --tag mcad-controller:issue_315_small_changes-v1.29.55 -f XXXXXXXXXX/multi-cluster-app-dispatcher/Dockerfile XXXXXXXXXX/multi-cluster-app-dispatcher -f784707e8982399ef7ef66e3d8a09b669e6deb17990d174400338813fb13c505 +podman build --quiet --no-cache --tag mcad-controller:main-v1.29.57 -f XXXXX/multi-cluster-app-dispatcher/Dockerfile XXXXX/multi-cluster-app-dispatcher ``` +The `GO_BUILD_ARGS` variable is also supported by the image builds with both `docker` and `podman`. To turn on the race detector in the image's executable, execute: `make images GO_BUILD_ARGS=-race` + ### Push the Multi-Cluster-App-Dispatcher Image to an Image Repository The following example assumes an available `/mcad-controller` on [Docker Hub](https://hub.docker.com) and using image version `v1.14` ```bash docker login -docker push /mcad-controller:v1.14 +docker push /mcad-controller:v1.14 ``` The same can be done with [Quay](quay.io) ```bash docker login quay.io -docker push /mcad-controller:v1.14 +docker push /mcad-controller:v1.14 ``` Refer to [deployment](../deploy/deployment.md) on how to deploy the `multi-cluster-app-dispatcher` as a controller in Kubernetes. diff --git a/hack/run-e2e-kind.sh b/hack/run-e2e-kind.sh index 2fbc101f4..f26daf8ec 100755 --- a/hack/run-e2e-kind.sh +++ b/hack/run-e2e-kind.sh @@ -28,7 +28,7 @@ # limitations under the License. export ROOT_DIR="$(dirname "$(dirname "$(readlink -fn "$0")")")" -export LOG_LEVEL=3 +export LOG_LEVEL=${TEST_LOG_LEVEL:-2} export CLEANUP_CLUSTER=${CLEANUP_CLUSTER:-"true"} export CLUSTER_CONTEXT="--name test" # Using older image due to older version of kubernetes cluster" @@ -334,7 +334,7 @@ function kube-test-env-up { # start mcad controller echo "Starting MCAD Controller..."
echo "helm install mcad-controller namespace kube-system wait set loglevel=2 set resources.requests.cpu=1000m set resources.requests.memory=1024Mi set resources.limits.cpu=4000m set resources.limits.memory=4096Mi set image.repository=$IMAGE_REPOSITORY_MCAD set image.tag=$IMAGE_TAG_MCAD set image.pullPolicy=$MCAD_IMAGE_PULL_POLICY" - helm upgrade --install mcad-controller ${ROOT_DIR}/deployment/mcad-controller --namespace kube-system --wait --set loglevel=2 --set resources.requests.cpu=1000m --set resources.requests.memory=1024Mi --set resources.limits.cpu=4000m --set resources.limits.memory=4096Mi --set configMap.name=mcad-controller-configmap --set configMap.podCreationTimeout='"120000"' --set configMap.quotaEnabled='"false"' --set coscheduler.rbac.apiGroup=scheduling.sigs.k8s.io --set coscheduler.rbac.resource=podgroups --set image.repository=$IMAGE_REPOSITORY_MCAD --set image.tag=$IMAGE_TAG_MCAD --set image.pullPolicy=$MCAD_IMAGE_PULL_POLICY + helm upgrade --install mcad-controller ${ROOT_DIR}/deployment/mcad-controller --namespace kube-system --wait --set loglevel=${LOG_LEVEL} --set resources.requests.cpu=1000m --set resources.requests.memory=1024Mi --set resources.limits.cpu=4000m --set resources.limits.memory=4096Mi --set configMap.name=mcad-controller-configmap --set configMap.podCreationTimeout='"120000"' --set configMap.quotaEnabled='"false"' --set coscheduler.rbac.apiGroup=scheduling.sigs.k8s.io --set coscheduler.rbac.resource=podgroups --set image.repository=$IMAGE_REPOSITORY_MCAD --set image.tag=$IMAGE_TAG_MCAD --set image.pullPolicy=$MCAD_IMAGE_PULL_POLICY sleep 10 echo "Listing MCAD Controller Helm Chart and Pod YAML..." @@ -362,6 +362,11 @@ function kube-test-env-up { done echo "kubectl uncordon test-worker" kubectl uncordon test-worker + echo "Waiting for pods in the kube-system namespace to become ready" + while [[ $(kubectl get pods -n kube-system -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' | tr ' ' '\n' | sort -u) != "True" ]] + do + echo -n "." && sleep 1; + done # Show available resources of cluster nodes echo "---" @@ -385,4 +390,4 @@ kind-up-cluster kube-test-env-up echo "==========================>>>>> Running E2E tests... <<<<<==========================" -go test ./test/e2e -v -timeout 75m \ No newline at end of file +go test ./test/e2e -v -timeout 75m -count=1 \ No newline at end of file diff --git a/pkg/controller/queuejob/active_appwrapper.go b/pkg/controller/queuejob/active_appwrapper.go new file mode 100644 index 000000000..1aabf70b9 --- /dev/null +++ b/pkg/controller/queuejob/active_appwrapper.go @@ -0,0 +1,54 @@ +package queuejob + +import ( + "strings" + "sync" + + arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" +) + +// ActiveAppWrapper is the current scheduling AppWrapper in the XController struct.
+// Its sole purpose is to provide a thread-safe way for the XController logic to use it. +type ActiveAppWrapper struct { + activeAW *arbv1.AppWrapper + activeAWMutex *sync.RWMutex +} + +// NewActiveAppWrapper creates a new ActiveAppWrapper with no active AppWrapper set. +func NewActiveAppWrapper() *ActiveAppWrapper { + return &ActiveAppWrapper{ + activeAW: nil, + activeAWMutex: &sync.RWMutex{}, + } +} + +// AtomicSet, as its name implies, atomically sets the activeAW to the new value. +func (aw *ActiveAppWrapper) AtomicSet(newValue *arbv1.AppWrapper) { + aw.activeAWMutex.Lock() + defer aw.activeAWMutex.Unlock() + aw.activeAW = newValue +} + +// IsActiveAppWrapper safely performs the comparison that was done inside the if block +// at line 1977 in queuejob_controller_ex.go +// The code looked like this: +// +// if !qj.Status.CanRun && qj.Status.State == arbv1.AppWrapperStateEnqueued && +// !cc.qjqueue.IfExistUnschedulableQ(qj) && !cc.qjqueue.IfExistActiveQ(qj) { +// // One more check to ensure AW is not the current active schedule object +// if cc.schedulingAW == nil || +// (strings.Compare(cc.schedulingAW.Namespace, qj.Namespace) != 0 && +// strings.Compare(cc.schedulingAW.Name, qj.Name) != 0) { +// cc.qjqueue.AddIfNotPresent(qj) +// klog.V(3).Infof("[manageQueueJob] Recovered AppWrapper %s%s - added to active queue, Status=%+v", +// qj.Namespace, qj.Name, qj.Status) +// return nil +// } +// } +func (aw *ActiveAppWrapper) IsActiveAppWrapper(name, namespace string) bool { + aw.activeAWMutex.RLock() + defer aw.activeAWMutex.RUnlock() + return aw.activeAW == nil || + (strings.Compare(aw.activeAW.Namespace, namespace) != 0 && + strings.Compare(aw.activeAW.Name, name) != 0) +} diff --git a/pkg/controller/queuejob/active_appwrapper_test.go b/pkg/controller/queuejob/active_appwrapper_test.go new file mode 100644 index 000000000..250ac1798 --- /dev/null +++ b/pkg/controller/queuejob/active_appwrapper_test.go @@ -0,0 +1,43 @@ +package queuejob_test + +import ( + "github.com/onsi/ginkgo" + "github.com/onsi/gomega" + arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejob" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +var _ = ginkgo.Describe("Active App Wrapper Tests", func() { + var activeAppWrapper *queuejob.ActiveAppWrapper + ginkgo.When("Checking the active app wrapper", func() { + ginkgo.BeforeEach(func() { + activeAppWrapper = queuejob.NewActiveAppWrapper() + }) + ginkgo.It("should return 'true' for a nil active app wrapper", func() { + gomega.Expect(activeAppWrapper.IsActiveAppWrapper("an-appwrapper-name", "unit-test-namespace")). + To(gomega.BeTrue()) + }) + ginkgo.It("should return 'false' for the same app wrapper name and namespace", func() { + activeAppWrapper.AtomicSet(&arbv1.AppWrapper{ + ObjectMeta: v1.ObjectMeta{ + Name: "an-appwrapper-name", + Namespace: "unit-test-namespace", + }, + }) + gomega.Expect(activeAppWrapper.IsActiveAppWrapper("an-appwrapper-name", "unit-test-namespace")). + To(gomega.BeFalse()) + }) + ginkgo.It("should return 'true' for a different app wrapper name and namespace", func() { + activeAppWrapper.AtomicSet(&arbv1.AppWrapper{ + ObjectMeta: v1.ObjectMeta{ + Name: "an-appwrapper-name", + Namespace: "unit-test-namespace", + }, + }) + gomega.Expect(activeAppWrapper.IsActiveAppWrapper("another-appwrapper-name", "other-unit-test-namespace")). + To(gomega.BeTrue()) + }) + }) + +}) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 9c36ac0df..6e7e82e66 
100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -165,7 +165,7 @@ type XController struct { quotaManager quota.QuotaManagerInterface // Active Scheduling AppWrapper - schedulingAW *arbv1.AppWrapper + schedulingAW *ActiveAppWrapper } type JobAndClusterAgent struct { @@ -224,6 +224,7 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) * updateQueue: cache.NewFIFO(GetQueueJobKey), qjqueue: NewSchedulingQueue(), cache: clusterstatecache.New(config), + schedulingAW: NewActiveAppWrapper(), } cc.metricsAdapter = adapter.New(serverOption, config, cc.cache) @@ -416,8 +417,6 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) * //create (empty) dispatchMap cc.dispatchMap = map[string]string{} - // Initialize current scheuling active AppWrapper - cc.schedulingAW = nil return cc } @@ -1098,7 +1097,6 @@ func (qjm *XController) ScheduleNext() { // if we have enough compute resources then we set the AllocatedReplicas to the total // amount of resources asked by the job qj, err := qjm.qjqueue.Pop() - qjm.schedulingAW = qj if err != nil { klog.V(3).Infof("[ScheduleNext] Cannot pop QueueJob from qjqueue! err=%#v", err) return // Try to pop qjqueue again @@ -1118,6 +1116,7 @@ func (qjm *XController) ScheduleNext() { klog.V(10).Infof("[ScheduleNext] %s found more recent copy from cache &apiQueueJob=%p apiQueueJob=%+v", apiCacheAWJob.Name, apiCacheAWJob, apiCacheAWJob) apiCacheAWJob.DeepCopyInto(qj) } + qjm.schedulingAW.AtomicSet(qj) // Re-compute SystemPriority for DynamicPriority policy if qjm.serverOption.DynamicPriority { @@ -1149,12 +1148,12 @@ func (qjm *XController) ScheduleNext() { // Retrieve HeadOfLine after priority update qj, err = qjm.qjqueue.Pop() - qjm.schedulingAW = qj if err != nil { klog.V(3).Infof("[ScheduleNext] Cannot pop QueueJob from qjqueue! 
err=%#v", err) } else { klog.V(3).Infof("[ScheduleNext] activeQ.Pop_afterPriorityUpdate %s *Delay=%.6f seconds RemainingLength=%d &qj=%p Version=%s Status=%+v", qj.Name, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds(), qjm.qjqueue.Length(), qj, qj.ResourceVersion, qj.Status) } + qjm.schedulingAW.AtomicSet(qj) } if qj.Status.CanRun { @@ -1620,7 +1619,7 @@ func (cc *XController) addQueueJob(obj interface{}) { qj.Status.SystemPriority = float64(qj.Spec.Priority) qj.Status.QueueJobState = arbv1.AppWrapperCondInit qj.Status.Conditions = []arbv1.AppWrapperCondition{ - arbv1.AppWrapperCondition{ + { Type: arbv1.AppWrapperCondInit, Status: v1.ConditionTrue, LastUpdateMicroTime: metav1.NowMicro(), @@ -1979,9 +1978,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool if !qj.Status.CanRun && qj.Status.State == arbv1.AppWrapperStateEnqueued && !cc.qjqueue.IfExistUnschedulableQ(qj) && !cc.qjqueue.IfExistActiveQ(qj) { // One more check to ensure AW is not the current active schedule object - if cc.schedulingAW == nil || - (strings.Compare(cc.schedulingAW.Namespace, qj.Namespace) != 0 && - strings.Compare(cc.schedulingAW.Name, qj.Name) != 0) { + if cc.schedulingAW.IsActiveAppWrapper(qj.Name, qj.Namespace) { cc.qjqueue.AddIfNotPresent(qj) klog.V(3).Infof("[manageQueueJob] Recovered AppWrapper %s%s - added to active queue, Status=%+v", qj.Namespace, qj.Name, qj.Status) @@ -2233,9 +2230,11 @@ func (cc *XController) Cleanup(appwrapper *arbv1.AppWrapper) error { for _, ar := range appwrapper.Spec.AggrResources.GenericItems { genericResourceName, gvk, err00 := cc.genericresources.Cleanup(appwrapper, &ar) if err00 != nil { - klog.Errorf("[Cleanup] Error deleting generic item %s, GVK=%s.%s.%s from job=%s Status=%+v err=%+v.", - genericResourceName, gvk.Group, gvk.Version, gvk.Kind, appwrapper.Name, appwrapper.Status, err00) + klog.Errorf("[Cleanup] Error deleting generic item %s, from app wrapper=%s Status=%+v err=%+v.", + genericResourceName, appwrapper.Name, appwrapper.Status, err00) } + klog.Infof("[Cleanup] Delete generic item %s, GVK=%s.%s.%s from app wrapper=%s Status=%+v", + genericResourceName, gvk.Group, gvk.Version, gvk.Kind, appwrapper.Name, appwrapper.Status) } } diff --git a/pkg/controller/queuejob/queuejob_suite_test.go b/pkg/controller/queuejob/queuejob_suite_test.go new file mode 100644 index 000000000..8b5fcf40e --- /dev/null +++ b/pkg/controller/queuejob/queuejob_suite_test.go @@ -0,0 +1,13 @@ +package queuejob_test + +import ( + "testing" + + "github.com/onsi/ginkgo" + "github.com/onsi/gomega" +) + +func TestQueuejob(t *testing.T) { + gomega.RegisterFailHandler(ginkgo.Fail) + ginkgo.RunSpecs(t, "Queuejob Suite") +} diff --git a/test/e2e/queue.go b/test/e2e/queue.go index b99441861..011b86147 100644 --- a/test/e2e/queue.go +++ b/test/e2e/queue.go @@ -297,10 +297,10 @@ var _ = Describe("AppWrapper E2E Test", func() { time.Sleep(2 * time.Minute) aw1, err := context.karclient.ArbV1().AppWrappers(aw.Namespace).Get(aw.Name, metav1.GetOptions{}) if err != nil { - fmt.Fprintf(os.Stdout, "Error getting status") + fmt.Fprint(GinkgoWriter, "Error getting status") } pass := false - fmt.Fprintf(os.Stdout, "[e2e] status of AW %v.\n", aw1.Status.State) + fmt.Fprintf(GinkgoWriter, "[e2e] status of AW %v.\n", aw1.Status.State) if len(aw1.Status.PendingPodConditions) == 0 { pass = true } @@ -526,10 +526,10 @@ var _ = Describe("AppWrapper E2E Test", func() { time.Sleep(60 * time.Second) aw1, err := context.karclient.ArbV1().AppWrappers(aw.Namespace).Get(aw.Name, 
metav1.GetOptions{}) if err != nil { - fmt.Fprintf(os.Stdout, "Error getting status") + fmt.Fprintf(GinkgoWriter, "Error getting status") } pass := false - fmt.Fprintf(os.Stdout, "[e2e] status of AW %v.\n", aw1.Status.State) + fmt.Fprintf(GinkgoWriter, "[e2e] status of AW %v.\n", aw1.Status.State) if aw1.Status.State == arbv1.AppWrapperStateFailed { pass = true } @@ -551,10 +551,10 @@ var _ = Describe("AppWrapper E2E Test", func() { time.Sleep(1 * time.Minute) aw1, err := context.karclient.ArbV1().AppWrappers(aw.Namespace).Get(aw.Name, metav1.GetOptions{}) if err != nil { - fmt.Fprintf(os.Stdout, "Error getting status") + fmt.Fprint(GinkgoWriter, "Error getting status") } pass := false - fmt.Fprintf(os.Stdout, "[e2e] status of AW %v.\n", aw1.Status.State) + fmt.Fprintf(GinkgoWriter, "[e2e] status of AW %v.\n", aw1.Status.State) if aw1.Status.State == arbv1.AppWrapperStateCompleted { pass = true } @@ -577,10 +577,10 @@ var _ = Describe("AppWrapper E2E Test", func() { time.Sleep(1 * time.Minute) aw1, err := context.karclient.ArbV1().AppWrappers(aw.Namespace).Get(aw.Name, metav1.GetOptions{}) if err != nil { - fmt.Fprintf(os.Stdout, "Error getting status") + fmt.Fprint(GinkgoWriter, "Error getting status") } pass := false - fmt.Fprintf(os.Stdout, "[e2e] status of AW %v.\n", aw1.Status.State) + fmt.Fprintf(GinkgoWriter, "[e2e] status of AW %v.\n", aw1.Status.State) if aw1.Status.State == arbv1.AppWrapperStateCompleted { pass = true } @@ -641,10 +641,10 @@ var _ = Describe("AppWrapper E2E Test", func() { time.Sleep(1 * time.Minute) aw1, err := context.karclient.ArbV1().AppWrappers(aw.Namespace).Get(aw.Name, metav1.GetOptions{}) if err != nil { - fmt.Fprintf(os.Stdout, "Error getting status") + fmt.Fprintf(GinkgoWriter, "Error getting status, %v", err) } pass := false - fmt.Fprintf(os.Stdout, "[e2e] status of AW %v.\n", aw1.Status.State) + fmt.Fprintf(GinkgoWriter, "[e2e] status of AW %v.\n", aw1.Status.State) if aw1.Status.State == arbv1.AppWrapperStateEnqueued { pass = true } @@ -689,10 +689,10 @@ var _ = Describe("AppWrapper E2E Test", func() { Expect(err1).NotTo(HaveOccurred()) aw1, err := context.karclient.ArbV1().AppWrappers(aw.Namespace).Get(aw.Name, metav1.GetOptions{}) if err != nil { - fmt.Fprintf(os.Stdout, "Error getting status") + fmt.Fprintf(GinkgoWriter, "Error getting status, %v", err) } pass := false - fmt.Fprintf(os.Stdout, "[e2e] status of AW %v.\n", aw1.Status.State) + fmt.Fprintf(GinkgoWriter, "[e2e] status of AW %v.\n", aw1.Status.State) if aw1.Status.State == arbv1.AppWrapperStateRunningHoldCompletion { pass = true } @@ -715,10 +715,10 @@ var _ = Describe("AppWrapper E2E Test", func() { Expect(err1).NotTo(HaveOccurred()) aw1, err := context.karclient.ArbV1().AppWrappers(aw.Namespace).Get(aw.Name, metav1.GetOptions{}) if err != nil { - fmt.Fprintf(os.Stdout, "Error getting status") + fmt.Fprintf(GinkgoWriter, "Error getting status, %v", err) } pass := false - fmt.Fprintf(os.Stdout, "[e2e] status of AW %v.\n", aw1.Status.State) + fmt.Fprintf(GinkgoWriter, "[e2e] status of AW %v.\n", aw1.Status.State) if aw1.Status.State == arbv1.AppWrapperStateActive { pass = true } @@ -733,8 +733,8 @@ var _ = Describe("AppWrapper E2E Test", func() { context := initTestContext() var aws []*arbv1.AppWrapper - // appwrappersPtr := &aws - // defer cleanupTestObjectsPtr(context, appwrappersPtr) + appwrappersPtr := &aws + defer cleanupTestObjectsPtr(context, appwrappersPtr) const ( awCount = 100 @@ -759,7 +759,7 @@ var _ = Describe("AppWrapper E2E Test", func() { name = fmt.Sprintf("%s%d", name, 
i+1) cpuDemand := "5m" if ((i+1)%modDivisor) == 0 || i == 0 { - fmt.Fprintf(os.Stdout, "[e2e] Creating AW %s with %s cpu and %d replica(s).\n", name, cpuDemand, replicas) + fmt.Fprintf(GinkgoWriter, "[e2e] Creating AW %s with %s cpu and %d replica(s).\n", name, cpuDemand, replicas) } aw := createGenericDeploymentWithCPUAW(context, name, cpuDemand, replicas) aws = append(aws, aw) @@ -768,9 +768,6 @@ var _ = Describe("AppWrapper E2E Test", func() { // Give the deployments time to create pods time.Sleep(2 * time.Minute) for i := 0; i < len(aws); i++ { - if ((i+1)%modDivisor) == 0 || i == 0 { - fmt.Fprintf(os.Stdout, "[e2e] Checking for %d replicas running for AW %s.\n", replicas, aws[i].Name) - } err := waitAWReadyQuiet(context, aws[i]) Expect(err).NotTo(HaveOccurred()) } diff --git a/test/e2e/util.go b/test/e2e/util.go index 5187413d4..55b222200 100644 --- a/test/e2e/util.go +++ b/test/e2e/util.go @@ -169,7 +169,7 @@ func cleanupTestContextExtendedTime(cxt *context, seconds time.Duration) { // Wait for namespace deleted. // err = wait.Poll(100*time.Millisecond, seconds, namespaceNotExist(cxt)) // if err != nil { - // fmt.Fprintf(os.Stdout, "[cleanupTestContextExtendedTime] Failure check for namespace: %s.\n", cxt.namespace) + // fmt.Fprintf(GinkgoWriter, "[cleanupTestContextExtendedTime] Failure check for namespace: %s.\n", cxt.namespace) // } //Expect(err).NotTo(HaveOccurred()) } @@ -407,12 +407,12 @@ func anyPodsExist(ctx *context, awNamespace string, awName string) wait.Conditio // First find a pod from the list that is part of the AW if awn, found := podFromPodList.Labels["appwrapper.mcad.ibm.com"]; !found || awn != awName { - //DEBUG fmt.Fprintf(os.Stdout, "[anyPodsExist] Pod %s in phase: %s not part of AppWrapper: %s, labels: %#v\n", + //DEBUG fmt.Fprintf(GinkgoWriter, "[anyPodsExist] Pod %s in phase: %s not part of AppWrapper: %s, labels: %#v\n", //DEBUG podFromPodList.Name, podFromPodList.Status.Phase, awName, podFromPodList.Labels) continue } podExistsNum++ - fmt.Fprintf(os.Stdout, "[anyPodsExist] Found Pod %s in phase: %s as part of AppWrapper: %s, labels: %#v\n", + fmt.Fprintf(GinkgoWriter, "[anyPodsExist] Found Pod %s in phase: %s as part of AppWrapper: %s, labels: %#v\n", podFromPodList.Name, podFromPodList.Status.Phase, awName, podFromPodList.Labels) } @@ -426,7 +426,7 @@ func podPhase(ctx *context, awNamespace string, awName string, pods []*v1.Pod, p Expect(err).NotTo(HaveOccurred()) if podList == nil || podList.Size() < 1 { - fmt.Fprintf(os.Stdout, "[podPhase] Listing podList found for Namespace: %s/%s resulting in no podList found that could match AppWrapper with pod count: %d\n", + fmt.Fprintf(GinkgoWriter, "[podPhase] Listing podList found for Namespace: %s/%s resulting in no podList found that could match AppWrapper with pod count: %d\n", awNamespace, awName, len(pods)) } @@ -436,7 +436,7 @@ func podPhase(ctx *context, awNamespace string, awName string, pods []*v1.Pod, p // First find a pod from the list that is part of the AW if awn, found := podFromPodList.Labels["appwrapper.mcad.ibm.com"]; !found || awn != awName { - fmt.Fprintf(os.Stdout, "[podPhase] Pod %s in phase: %s not part of AppWrapper: %s, labels: %#v\n", + fmt.Fprintf(GinkgoWriter, "[podPhase] Pod %s in phase: %s not part of AppWrapper: %s, labels: %#v\n", podFromPodList.Name, podFromPodList.Status.Phase, awName, podFromPodList.Labels) continue } @@ -458,8 +458,8 @@ func podPhase(ctx *context, awNamespace string, awName string, pods []*v1.Pod, p } } - if matchToPodsFromInput == false { - 
fmt.Fprintf(os.Stdout, "[podPhase] Pod %s in phase: %s does not match any input pods: %#v \n", + if !matchToPodsFromInput { + fmt.Fprintf(GinkgoWriter, "[podPhase] Pod %s in phase: %s does not match any input pods: %#v \n", podFromPodList.Name, podFromPodList.Status.Phase, inputPodIDs) } break @@ -478,7 +478,7 @@ func awStatePhase(ctx *context, aw *arbv1.AppWrapper, phase []arbv1.AppWrapperSt phaseCount := 0 if !quite { - fmt.Fprintf(os.Stdout, "[awStatePhase] AW %s found with state: %s.\n", aw.Name, aw.Status.State) + fmt.Fprintf(GinkgoWriter, "[awStatePhase] AW %s found with state: %s.\n", aw.Name, aw.Status.State) } for _, p := range phase { @@ -497,7 +497,7 @@ func cleanupTestObjectsPtr(context *context, appwrappersPtr *[]*arbv1.AppWrapper func cleanupTestObjectsPtrVerbose(context *context, appwrappersPtr *[]*arbv1.AppWrapper, verbose bool) { if appwrappersPtr == nil { - fmt.Fprintf(os.Stdout, "[cleanupTestObjectsPtr] No AppWrappers to cleanup.\n") + fmt.Fprintf(GinkgoWriter, "[cleanupTestObjectsPtr] No AppWrappers to cleanup.\n") } else { cleanupTestObjects(context, *appwrappersPtr) } @@ -509,7 +509,7 @@ func cleanupTestObjects(context *context, appwrappers []*arbv1.AppWrapper) { func cleanupTestObjectsVerbose(context *context, appwrappers []*arbv1.AppWrapper, verbose bool) { if appwrappers == nil { - fmt.Fprintf(os.Stdout, "[cleanupTestObjects] No AppWrappers to cleanup.\n") + fmt.Fprintf(GinkgoWriter, "[cleanupTestObjects] No AppWrappers to cleanup.\n") return } @@ -519,13 +519,13 @@ func cleanupTestObjectsVerbose(context *context, appwrappers []*arbv1.AppWrapper pods := getPodsOfAppWrapper(context, aw) awNamespace := aw.Namespace awName := aw.Name - fmt.Fprintf(os.Stdout, "[cleanupTestObjects] Deleting AW %s.\n", aw.Name) + fmt.Fprintf(GinkgoWriter, "[cleanupTestObjects] Deleting AW %s.\n", aw.Name) err := deleteAppWrapper(context, aw.Name) Expect(err).NotTo(HaveOccurred()) // Wait for the pods of the deleted the appwrapper to be destroyed for _, pod := range pods { - fmt.Fprintf(os.Stdout, "[cleanupTestObjects] Awaiting pod %s/%s to be deleted for AW %s.\n", + fmt.Fprintf(GinkgoWriter, "[cleanupTestObjects] Awaiting pod %s/%s to be deleted for AW %s.\n", pod.Namespace, pod.Name, aw.Name) } err = waitAWPodsDeleted(context, awNamespace, awName, pods) @@ -536,7 +536,7 @@ func cleanupTestObjectsVerbose(context *context, appwrappers []*arbv1.AppWrapper for _, pod := range pods { podExist, _ := context.kubeclient.CoreV1().Pods(pod.Namespace).Get(gcontext.Background(), pod.Name, metav1.GetOptions{}) if podExist != nil { - fmt.Fprintf(os.Stdout, "[cleanupTestObjects] Found pod %s/%s %s, not completedly deleted for AW %s.\n", podExist.Namespace, podExist.Name, podExist.Status.Phase, aw.Name) + fmt.Fprintf(GinkgoWriter, "[cleanupTestObjects] Found pod %s/%s %s, not completely deleted for AW %s.\n", podExist.Namespace, podExist.Name, podExist.Status.Phase, aw.Name) podsStillExisting = append(podsStillExisting, podExist) } } @@ -560,7 +560,7 @@ func awPodPhase(ctx *context, aw *arbv1.AppWrapper, phase []v1.PodPhase, taskNum Expect(err).NotTo(HaveOccurred()) if podList == nil || podList.Size() < 1 { - fmt.Fprintf(os.Stdout, "[awPodPhase] Listing podList found for Namespace: %s resulting in no podList found that could match AppWrapper: %s \n", + fmt.Fprintf(GinkgoWriter, "[awPodPhase] Listing podList found for Namespace: %s resulting in no podList found that could match AppWrapper: %s \n", aw.Namespace, aw.Name) } @@ -568,7 +568,7 @@ func awPodPhase(ctx *context, aw *arbv1.AppWrapper, phase 
[]v1.PodPhase, taskNum for _, pod := range podList.Items { if awn, found := pod.Labels["appwrapper.mcad.ibm.com"]; !found || awn != aw.Name { if !quite { - fmt.Fprintf(os.Stdout, "[awPodPhase] Pod %s not part of AppWrapper: %s, labels: %s\n", pod.Name, aw.Name, pod.Labels) + fmt.Fprintf(GinkgoWriter, "[awPodPhase] Pod %s not part of AppWrapper: %s, labels: %s\n", pod.Name, aw.Name, pod.Labels) } continue } @@ -576,7 +576,7 @@ func awPodPhase(ctx *context, aw *arbv1.AppWrapper, phase []v1.PodPhase, taskNum for _, p := range phase { if pod.Status.Phase == p { //DEBUGif quite { - //DEBUG fmt.Fprintf(os.Stdout, "[awPodPhase] Found pod %s of AppWrapper: %s, phase: %v\n", pod.Name, aw.Name, p) + //DEBUG fmt.Fprintf(GinkgoWriter, "[awPodPhase] Found pod %s of AppWrapper: %s, phase: %v\n", pod.Name, aw.Name, p) //DEBUG} readyTaskNum++ break @@ -584,7 +584,7 @@ func awPodPhase(ctx *context, aw *arbv1.AppWrapper, phase []v1.PodPhase, taskNum pMsg := pod.Status.Message if len(pMsg) > 0 { pReason := pod.Status.Reason - fmt.Fprintf(os.Stdout, "[awPodPhase] pod: %s, phase: %s, reason: %s, message: %s\n", pod.Name, p, pReason, pMsg) + fmt.Fprintf(GinkgoWriter, "[awPodPhase] pod: %s, phase: %s, reason: %s, message: %s\n", pod.Name, p, pReason, pMsg) } containerStatuses := pod.Status.ContainerStatuses for _, containerStatus := range containerStatuses { @@ -594,7 +594,7 @@ func awPodPhase(ctx *context, aw *arbv1.AppWrapper, phase []v1.PodPhase, taskNum if len(wMsg) > 0 { wReason := waitingState.Reason containerName := containerStatus.Name - fmt.Fprintf(os.Stdout, "[awPodPhase] condition for pod: %s, phase: %s, container name: %s, "+ + fmt.Fprintf(GinkgoWriter, "[awPodPhase] condition for pod: %s, phase: %s, container name: %s, "+ "reason: %s, message: %s\n", pod.Name, p, containerName, wReason, wMsg) } } @@ -604,7 +604,7 @@ func awPodPhase(ctx *context, aw *arbv1.AppWrapper, phase []v1.PodPhase, taskNum } //DEBUGif taskNum <= readyTaskNum && quite { - //DEBUG fmt.Fprintf(os.Stdout, "[awPodPhase] Successfully found %v podList of AppWrapper: %s, state: %s\n", readyTaskNum, aw.Name, aw.Status.State) + //DEBUG fmt.Fprintf(GinkgoWriter, "[awPodPhase] Successfully found %v podList of AppWrapper: %s, state: %s\n", readyTaskNum, aw.Name, aw.Status.State) //DEBUG} return taskNum <= readyTaskNum, nil
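
For reviewers skimming the patch, here is a minimal, self-contained sketch (not part of the patch) of the locking pattern that `ActiveAppWrapper` introduces in `pkg/controller/queuejob/active_appwrapper.go`. The `awRef` and `activeRef` types are hypothetical stand-ins for `*arbv1.AppWrapper` and `ActiveAppWrapper` so the example compiles on its own:

```go
// Illustrative sketch only: mirrors the ActiveAppWrapper pattern from the patch,
// with awRef standing in for *arbv1.AppWrapper so the example is self-contained.
package main

import (
	"fmt"
	"sync"
)

// awRef is a hypothetical stand-in for *arbv1.AppWrapper.
type awRef struct {
	Name      string
	Namespace string
}

// activeRef guards the AppWrapper currently being scheduled with an RWMutex,
// the same approach ActiveAppWrapper takes in the patch.
type activeRef struct {
	active *awRef
	mu     sync.RWMutex
}

// AtomicSet atomically replaces the active reference (write lock).
func (a *activeRef) AtomicSet(v *awRef) {
	a.mu.Lock()
	defer a.mu.Unlock()
	a.active = v
}

// IsActiveAppWrapper returns true when nothing is active, or when the given
// name and namespace both differ from the active reference, matching the
// original manageQueueJob check (read lock).
func (a *activeRef) IsActiveAppWrapper(name, namespace string) bool {
	a.mu.RLock()
	defer a.mu.RUnlock()
	return a.active == nil ||
		(a.active.Namespace != namespace && a.active.Name != name)
}

func main() {
	ref := &activeRef{}
	fmt.Println(ref.IsActiveAppWrapper("aw-1", "ns-1")) // true: nothing is active yet
	ref.AtomicSet(&awRef{Name: "aw-1", Namespace: "ns-1"})
	fmt.Println(ref.IsActiveAppWrapper("aw-1", "ns-1")) // false: aw-1/ns-1 is the active AppWrapper
	fmt.Println(ref.IsActiveAppWrapper("aw-2", "ns-2")) // true: a different AppWrapper
}
```

The RWMutex lets many `manageQueueJob` workers run the check concurrently under the read lock, while `ScheduleNext` takes the write lock only for the brief `AtomicSet` call.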