diff --git a/Makefile b/Makefile
index b2673c4f8..203b02d17 100644
--- a/Makefile
+++ b/Makefile
@@ -20,6 +20,9 @@ BUNDLE_VERSION ?= $(VERSION:v%=%)
 
 # KUEUE_VERSION defines the default version of Kueue (used for testing)
 KUEUE_VERSION ?= v0.6.2
+
+# USE_RHOAI selects the AI platform operator handled by the RHOAI/ODH targets: `true` means RHOAI, any other value means Open Data Hub
+USE_RHOAI ?= true
 
 # KUBERAY_VERSION defines the default version of the KubeRay operator (used for testing)
 KUBERAY_VERSION ?= v1.1.0
@@ -419,3 +422,168 @@ image-mnist-job-test-push: image-mnist-job-test-build ## Push container image wi
 .PHONY: kueue-setup
 kueue-setup:
 	bash scripts/setup-kueue-resources.sh
+
+# RHOAI/ODH related resources installation
+#
+# Basic usage:
+# `all-in-one` installs all of the operators needed to run GPU-enabled ML workloads via OpenShift AI.
+# Users can choose between installing RHOAI (the default) and ODH.
+# For RHOAI, run `make all-in-one`; to remove all of the operators, run `make delete-all-in-one`.
+# For ODH, run `make all-in-one USE_RHOAI=false`; to remove all of the operators, run `make delete-all-in-one USE_RHOAI=false`.
+#
+# Every step of the aggregate targets is prefixed with `-` so that one failing step does not abort the remaining ones.
+# Sub-makes are started through $(MAKE) so that the same make binary is used and flags such as -n are honoured.
+
+##@ all-in-one
+.PHONY: all-in-one
+all-in-one: ## Install the NFD, Service Mesh, AI platform (RHOAI or ODH) and NVIDIA operators
+	@echo -e "\n ==> Installing Everything needed for distributed AI platform on OpenShift cluster \n"
+	-$(MAKE) install-nfd-operator
+	-$(MAKE) install-service-mesh-operator
+	-$(MAKE) install-ai-platform-operator
+	-$(MAKE) install-nvidia-operator
+
+.PHONY: delete-all-in-one
+delete-all-in-one: ## Delete the NFD, AI platform (RHOAI or ODH), Service Mesh and NVIDIA operators
+	@echo -e "\n ==> Removing Everything needed for distributed AI platform on OpenShift cluster \n"
+	-$(MAKE) delete-nfd-operator
+	-$(MAKE) delete-ai-platform-operator
+	-$(MAKE) delete-service-mesh-operator
+	-$(MAKE) delete-nvidia-operator
+
+##@ general
+.PHONY: delete-ai-platform-operator
+delete-ai-platform-operator: ## Delete the RHOAI (USE_RHOAI=true) or Open Data Hub operator and its accelerator profile
+ifeq ($(USE_RHOAI), true) ## Delete RHOAI Operator
+	-$(MAKE) delete-rhoai-operator
+	-kubectl delete -f contrib/configuration/accelerator-profile.yaml -n redhat-ods-applications
+else ## Delete Open Data Hub Operator
+	-$(MAKE) delete-opendatahub-operator
+	-kubectl delete -f contrib/configuration/accelerator-profile.yaml -n opendatahub
+endif
+
+.PHONY: install-ai-platform-operator
+install-ai-platform-operator: ## Install the RHOAI (USE_RHOAI=true) or Open Data Hub operator and apply the accelerator profile
+ifeq ($(USE_RHOAI), true) ## Install RHOAI Operator
+	-$(MAKE) install-rhoai-operator
+	-kubectl apply -f contrib/configuration/accelerator-profile.yaml -n redhat-ods-applications
+else ## Install Open Data Hub Operator
+	-$(MAKE) install-opendatahub-operator
+	-kubectl apply -f contrib/configuration/accelerator-profile.yaml -n opendatahub
+endif
+
+.PHONY: delete-rhoai-operator
+delete-rhoai-operator: ## Delete RHOAI Operator
+	@echo -e "\n ==> Deleting OpenShift AI Operator \n"
+	kubectl delete datasciencecluster/default-dsc
+	kubectl wait --for=delete datasciencecluster/default-dsc --timeout=180s
+	kubectl delete dsci/default-dsci
+	kubectl wait --for=delete dsci/default-dsci --timeout=180s
+	-kubectl delete subscription rhods-operator -n redhat-ods-operator
+# Delete the CSV by label selector instead of looking its name up first, so an empty lookup cannot produce a malformed command
+	-kubectl delete clusterserviceversion -n redhat-ods-operator -l operators.coreos.com/rhods-operator.redhat-ods-operator
+	kubectl delete namespace redhat-ods-operator
+
+.PHONY: install-rhoai-operator
+install-rhoai-operator: ## Install RHOAI Operator
+	@echo -e "\n ==> Installing OpenShift AI Operator \n"
+	-kubectl create ns redhat-ods-operator
+	kubectl create -f contrib/configuration/rhoai/rhoai-operator-subscription.yaml
+	@echo Waiting for rhoai-operator Subscription to be ready
+	kubectl wait -n redhat-ods-operator subscription/rhods-operator --for=jsonpath='{.status.state}'=AtLatestKnown --timeout=180s
+	@while [[ -z $$(kubectl get deployment/rhods-operator -n redhat-ods-operator) ]]; do echo "."; sleep 10; done
+# Best-effort wait on the Deployment polled above; grepping for a pod name breaks when zero or several pods match
+	-kubectl wait --for=condition=available deployment/rhods-operator -n redhat-ods-operator --timeout=180s
+	@echo -e "\n==> Creating default Data Science Cluster \n"
+	kubectl apply -f contrib/configuration/rhoai/default-dsci.yaml --server-side
+	kubectl apply -f contrib/configuration/rhoai/default-dsc.yaml --server-side
+
+.PHONY: delete-opendatahub-operator
+delete-opendatahub-operator: ## Delete OpenDataHub operator
+	@echo -e "\n==> Deleting OpenDataHub Operator \n"
+	kubectl delete datasciencecluster/default-dsc
+	kubectl wait --for=delete datasciencecluster/default-dsc --timeout=180s
+	kubectl delete dsci/default-dsci
+	kubectl wait --for=delete dsci/default-dsci --timeout=180s
+	-kubectl delete subscription opendatahub-operator -n openshift-operators
+# Delete the CSV by label selector instead of looking its name up first, so an empty lookup cannot produce a malformed command
+	-kubectl delete clusterserviceversion -n openshift-operators -l operators.coreos.com/opendatahub-operator.openshift-operators
+	-kubectl delete namespace opendatahub
+
+.PHONY: install-opendatahub-operator
+install-opendatahub-operator: ## Install OpenDataHub operator
+	@echo -e "\n==> Installing OpenDataHub Operator \n"
+	-kubectl create ns opendatahub
+	kubectl create -f contrib/configuration/odh/opendatahub-operator-subscription.yaml
+	@echo Waiting for opendatahub-operator Subscription to be ready
+	kubectl wait -n openshift-operators subscription/opendatahub-operator --for=jsonpath='{.status.state}'=AtLatestKnown --timeout=180s
+	@while [[ -z $$(kubectl get deployment/opendatahub-operator-controller-manager -n openshift-operators) ]]; do echo "."; sleep 10; done
+	kubectl wait --for=condition=available deployment/opendatahub-operator-controller-manager -n openshift-operators --timeout=180s
+	-export ODH_POD_NAME=`kubectl get -n openshift-operators pod -o custom-columns=:metadata.name | grep opendatahub-operator-controller-manager`; \
+	kubectl wait --for=condition=Ready pod/$$ODH_POD_NAME -n openshift-operators
+	kubectl apply -f contrib/configuration/odh/default-dsci.yaml --server-side
+	kubectl apply -f contrib/configuration/odh/default-dsc.yaml --server-side
+
+.PHONY: delete-service-mesh-operator
+delete-service-mesh-operator: ## Delete Service Mesh Operator
+	@echo -e "\n==> Deleting Service Mesh Operator \n"
+	-kubectl delete subscription servicemeshoperator -n openshift-operators
+# Both deletions are best-effort (like the other delete-* targets) so a missing Subscription does not skip the CSV cleanup
+	-kubectl delete clusterserviceversion -n openshift-operators -l operators.coreos.com/servicemeshoperator.openshift-operators
+
+.PHONY: install-service-mesh-operator
+install-service-mesh-operator: ## Install Service Mesh Operator
+	@echo -e "\n==> Installing OpenShift Service Mesh Operator"
+	kubectl create -f contrib/configuration/service-mesh-operator-subscription.yaml
+	kubectl wait -n openshift-operators subscription/servicemeshoperator --for=jsonpath='{.status.state}'=AtLatestKnown --timeout=180s
+	@while [[ -z $$(kubectl get deployment/istio-operator -n openshift-operators) ]]; do echo "."; sleep 10; done
+	kubectl wait --for=condition=available deployment/istio-operator -n openshift-operators --timeout=180s
+
+##@ GPU Support
+.PHONY: install-nfd-operator
+install-nfd-operator: ## Install NFD operator ( Node Feature Discovery ) and create the NodeFeatureDiscovery CR from the CSV's alm-examples
+	@echo -e "\n==> Installing NFD Operator \n"
+	-kubectl create ns openshift-nfd
+	kubectl create -f contrib/configuration/nfd-operator-subscription.yaml
+	@echo -e "\n==> Creating default NodeFeatureDiscovery CR \n"
+	@while [[ -z $$(kubectl get customresourcedefinition nodefeaturediscoveries.nfd.openshift.io) ]]; do echo "."; sleep 10; done
+	@while [[ -z $$(kubectl get csv -n openshift-nfd --selector operators.coreos.com/nfd.openshift-nfd) ]]; do echo "."; sleep 10; done
+	kubectl get csv -n openshift-nfd --selector operators.coreos.com/nfd.openshift-nfd -o jsonpath='{.items[0].metadata.annotations.alm-examples}' | jq '.[] | select(.kind=="NodeFeatureDiscovery")' | kubectl apply -f - --validate=false
+
+.PHONY: delete-nfd-operator
+delete-nfd-operator: ## Delete NFD operator
+	@echo -e "\n==> Deleting NodeFeatureDiscovery CR \n"
+	kubectl delete NodeFeatureDiscovery --all -n openshift-nfd
+	@while [[ -n $$(kubectl get NodeFeatureDiscovery -n openshift-nfd) ]]; do echo "."; sleep 10; done
+	@echo -e "\n==> Deleting NFD Operator \n"
+	-kubectl delete subscription nfd -n openshift-nfd
+# Delete the CSV by label selector instead of looking its name up first, so an empty lookup cannot produce a malformed command
+	-kubectl delete clusterserviceversion -n openshift-nfd -l operators.coreos.com/nfd.openshift-nfd
+	-kubectl delete ns openshift-nfd
+
+.PHONY: install-nvidia-operator
+install-nvidia-operator: ## Install nvidia operator and create the resources listed in its CSV's alm-examples
+	@echo -e "\n==> Installing nvidia Operator \n"
+	-kubectl create ns nvidia-gpu-operator
+	kubectl create -f contrib/configuration/nvidia-operator-subscription.yaml
+	@echo -e "\n==> Creating default ClusterPolicy CR \n"
+	@while [[ -z $$(kubectl get customresourcedefinition clusterpolicies.nvidia.com) ]]; do echo "."; sleep 10; done
+	@while [[ -z $$(kubectl get csv -n nvidia-gpu-operator --selector operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator) ]]; do echo "."; sleep 10; done
+	kubectl get csv -n nvidia-gpu-operator --selector operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator -o jsonpath='{.items[0].metadata.annotations.alm-examples}' | jq '.[]' | kubectl apply -f -
+ifeq ($(USE_RHOAI), true) ## Additional steps required for RHOAI
+	kubectl delete configmap migration-gpu-status -n redhat-ods-applications --ignore-not-found=true
+# Best-effort removal of the rhods-dashboard ReplicaSets by label (presumably so the dashboard restarts and detects the GPUs - TODO confirm)
+	-kubectl delete replicaset -l app=rhods-dashboard -n redhat-ods-applications
+endif
+
+.PHONY: delete-nvidia-operator
+delete-nvidia-operator: ## Delete nvidia operator
+	@echo -e "\n==> Deleting ClusterPolicy CR \n"
+	kubectl delete --ignore-not-found=true NVIDIADriver gpu-driver
+	kubectl delete ClusterPolicy --all -n nvidia-gpu-operator
+	@while [[ -n $$(kubectl get ClusterPolicy -n nvidia-gpu-operator) ]]; do echo "."; sleep 10; done
+	@echo -e "\n==> Deleting nvidia Operator \n"
+	-kubectl delete subscription gpu-operator-certified -n nvidia-gpu-operator
+# Delete the CSV by label selector instead of looking its name up first, so an empty lookup cannot produce a malformed command
+	-kubectl delete clusterserviceversion -n nvidia-gpu-operator -l operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator
+	-kubectl delete ns nvidia-gpu-operator
diff --git a/contrib/configuration/accelerator-profile.yaml b/contrib/configuration/accelerator-profile.yaml
new file mode 100644
index 000000000..37f64ea04
--- /dev/null
+++ b/contrib/configuration/accelerator-profile.yaml
@@ -0,0 +1,8 @@
+apiVersion: dashboard.opendatahub.io/v1
+kind: AcceleratorProfile
+metadata:
+  name: gpu-accelerator-profile
+spec:
+  displayName: nvidia-gpu
+  enabled: true
+  identifier: nvidia.com/gpu
diff --git a/contrib/configuration/nfd-operator-subscription.yaml b/contrib/configuration/nfd-operator-subscription.yaml
new file mode 100644
index 000000000..5653ac3a9
--- /dev/null
+++ b/contrib/configuration/nfd-operator-subscription.yaml
@@ -0,0 +1,22 @@
+apiVersion: operators.coreos.com/v1
+kind: OperatorGroup
+metadata:
+  name: nfd
+  namespace: openshift-nfd
+spec:
+  targetNamespaces:
+    - openshift-nfd
+---
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: nfd
+  labels:
+    operators.coreos.com/nfd.openshift-nfd: ''
+  namespace: openshift-nfd
+spec:
+  channel: stable
+  name: nfd
+  installPlanApproval: Automatic
+  source: redhat-operators
+  sourceNamespace: openshift-marketplace
diff --git a/contrib/configuration/nvidia-operator-subscription.yaml b/contrib/configuration/nvidia-operator-subscription.yaml
new file mode 100644
index 000000000..3fa10297c
--- /dev/null
+++ b/contrib/configuration/nvidia-operator-subscription.yaml
@@ -0,0 +1,23 @@
+apiVersion: operators.coreos.com/v1
+kind: OperatorGroup
+metadata:
+  name: gpu-operator-certified
+  namespace: nvidia-gpu-operator
+spec:
+  # Target the operator's own namespace so the install does not depend on the ODH-only `opendatahub` namespace
+  targetNamespaces:
+    - nvidia-gpu-operator
+---
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: gpu-operator-certified
+  labels:
+    operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator: ''
+  namespace: nvidia-gpu-operator
+spec:
+  channel: stable
+  name: gpu-operator-certified
+  installPlanApproval: Automatic
+  source: certified-operators
+  sourceNamespace: openshift-marketplace
diff --git a/contrib/configuration/odh/default-dsc.yaml b/contrib/configuration/odh/default-dsc.yaml
new file mode 100644
index 000000000..d5e4d902a
--- /dev/null
+++ b/contrib/configuration/odh/default-dsc.yaml
@@ -0,0 +1,40 @@
+kind: DataScienceCluster
+apiVersion: datasciencecluster.opendatahub.io/v1
+metadata:
+  labels:
+    app.kubernetes.io/created-by: opendatahub-operator
+    app.kubernetes.io/instance: default
+    app.kubernetes.io/managed-by: kustomize
+    app.kubernetes.io/name: datasciencecluster
+    app.kubernetes.io/part-of: opendatahub-operator
+  name: default-dsc
+spec:
+  components:
+    codeflare:
+      managementState: Managed
+    dashboard:
+      managementState: Managed
+    datasciencepipelines:
+      managementState: Managed
+    kserve:
+      managementState: Managed
+      serving:
+        ingressGateway:
+          certificate:
+            type: SelfSigned
+        managementState: Managed
+        name: knative-serving
+    kueue:
+      managementState: Managed
+    modelmeshserving:
+      managementState: Managed
+    modelregistry:
+      managementState: Removed
+    ray:
+      managementState: Managed
+    trainingoperator:
+      managementState: Removed
+    trustyai:
+      managementState: Managed
+    workbenches:
+      managementState: Managed
diff --git a/contrib/configuration/odh/default-dsci.yaml b/contrib/configuration/odh/default-dsci.yaml
new file mode 100644
index 000000000..e9f1b2995
--- /dev/null
+++ b/contrib/configuration/odh/default-dsci.yaml
@@ -0,0 +1,24 @@
+kind: DSCInitialization
+apiVersion: dscinitialization.opendatahub.io/v1
+metadata:
+  labels:
+    app.kubernetes.io/created-by: opendatahub-operator
+    app.kubernetes.io/instance: default
+    app.kubernetes.io/managed-by: kustomize
+    app.kubernetes.io/name: dscinitialization
+    app.kubernetes.io/part-of: opendatahub-operator
+  name: default-dsci
+spec:
+  applicationsNamespace: opendatahub
+  monitoring:
+    managementState: Managed
+    namespace: opendatahub
+  serviceMesh:
+    controlPlane:
+      metricsCollection: Istio
+      name: data-science-smcp
+      namespace: istio-system
+    managementState: Managed
+  trustedCABundle:
+    customCABundle: ''
+    managementState: Managed
diff --git a/contrib/configuration/odh/opendatahub-operator-subscription.yaml b/contrib/configuration/odh/opendatahub-operator-subscription.yaml
new file mode 100644
index 000000000..6f7521595
--- /dev/null
+++ b/contrib/configuration/odh/opendatahub-operator-subscription.yaml
@@ -0,0 +1,13 @@
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: opendatahub-operator
+  labels:
+    operators.coreos.com/opendatahub-operator.openshift-operators: ''
+  namespace: openshift-operators
+spec:
+  channel: fast
+  name: opendatahub-operator
+  installPlanApproval: Automatic
+  source: community-operators
+  sourceNamespace: openshift-marketplace
diff --git a/contrib/configuration/rhoai/default-dsc.yaml b/contrib/configuration/rhoai/default-dsc.yaml
new file mode 100644
index 000000000..718d87b07
--- /dev/null
+++ b/contrib/configuration/rhoai/default-dsc.yaml
@@ -0,0 +1,34 @@
+kind: DataScienceCluster
+apiVersion: datasciencecluster.opendatahub.io/v1
+metadata:
+  labels:
+    app.kubernetes.io/created-by: rhods-operator
+    app.kubernetes.io/instance: default-dsc
+    app.kubernetes.io/managed-by: kustomize
+    app.kubernetes.io/name: datasciencecluster
+    app.kubernetes.io/part-of: rhods-operator
+  name: default-dsc
+spec:
+  components:
+    codeflare:
+      managementState: Managed
+    dashboard:
+      managementState: Managed
+    datasciencepipelines:
+      managementState: Managed
+    kserve:
+      managementState: Managed
+      serving:
+        ingressGateway:
+          certificate:
+            type: SelfSigned
+        managementState: Managed
+        name: knative-serving
+    kueue:
+      managementState: Managed
+    modelmeshserving:
+      managementState: Managed
+    ray:
+      managementState: Managed
+    workbenches:
+      managementState: Managed
diff --git a/contrib/configuration/rhoai/default-dsci.yaml b/contrib/configuration/rhoai/default-dsci.yaml
new file mode 100644
index 000000000..91d3d1b00
--- /dev/null
+++ b/contrib/configuration/rhoai/default-dsci.yaml
@@ -0,0 +1,24 @@
+kind: DSCInitialization
+apiVersion: dscinitialization.opendatahub.io/v1
+metadata:
+  labels:
+    app.kubernetes.io/created-by: rhods-operator
+    app.kubernetes.io/instance: default-dsci
+    app.kubernetes.io/managed-by: kustomize
+    app.kubernetes.io/name: dscinitialization
+    app.kubernetes.io/part-of: rhods-operator
+  name: default-dsci
+spec:
+  applicationsNamespace: redhat-ods-applications
+  monitoring:
+    managementState: Managed
+    namespace: redhat-ods-monitoring
+  serviceMesh:
+    controlPlane:
+      metricsCollection: Istio
+      name: data-science-smcp
+      namespace: istio-system
+    managementState: Managed
+  trustedCABundle:
+    customCABundle: ''
+    managementState: Managed
diff --git a/contrib/configuration/rhoai/rhoai-operator-subscription.yaml b/contrib/configuration/rhoai/rhoai-operator-subscription.yaml
new file mode 100644
index 000000000..15373ca51
--- /dev/null
+++ b/contrib/configuration/rhoai/rhoai-operator-subscription.yaml
@@ -0,0 +1,19 @@
+apiVersion: operators.coreos.com/v1
+kind: OperatorGroup
+metadata:
+  name: rhods-operator
+  namespace: redhat-ods-operator
+---
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: rhods-operator
+  labels:
+    operators.coreos.com/rhods-operator.redhat-ods-operator: ''
+  namespace: redhat-ods-operator
+spec:
+  channel: fast
+  name: rhods-operator
+  installPlanApproval: Automatic
+  source: redhat-operators
+  sourceNamespace: openshift-marketplace
diff --git a/contrib/configuration/service-mesh-operator-subscription.yaml b/contrib/configuration/service-mesh-operator-subscription.yaml
new file mode 100644
index 000000000..9e21a4c77
--- /dev/null
+++ b/contrib/configuration/service-mesh-operator-subscription.yaml
@@ -0,0 +1,13 @@
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: servicemeshoperator
+  labels:
+    operators.coreos.com/servicemeshoperator.openshift-operators: ''
+  namespace: openshift-operators
+spec:
+  channel: stable
+  name: servicemeshoperator
+  installPlanApproval: Automatic
+  source: redhat-operators
+  sourceNamespace: openshift-marketplace