From 6485723d5fc60a322c46915f77db9d910b77b35c Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 14 Jul 2021 22:37:30 +0300 Subject: [PATCH 01/40] Limit istiod to 5 replicas and the default namespace --- manager/install.sh | 1 + manager/manifests/istio.yaml.j2 | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/manager/install.sh b/manager/install.sh index 7d63049871..7b873eaba9 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -361,6 +361,7 @@ function remove_nodegroups() { function setup_istio() { envsubst < manifests/istio-namespace.yaml | kubectl apply -f - >/dev/null + kubectl label namespaces default istio-discovery=enabled if ! grep -q "istio-customgateway-certs" <<< $(kubectl get secret -n istio-system); then WEBSITE=localhost diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index ecccf38695..53714590ba 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ -18,6 +18,10 @@ spec: profile: minimal hub: {{ env['CORTEX_IMAGE_ISTIO_PROXY_HUB'] }} # this is only used by proxy, since pilot overrides it (proxy doesn't have dedicated hub config) tag: {{ env['CORTEX_IMAGE_ISTIO_PROXY_TAG'] }} # this is only used by proxy, since pilot overrides it (proxy doesn't have dedicated tag config) + meshConfig: + discoverySelectors: + - matchLabels: + istio-discovery: enabled components: pilot: # "pilot" refers to the istiod container hub: {{ env['CORTEX_IMAGE_ISTIO_PILOT_HUB'] }} @@ -26,7 +30,23 @@ spec: resources: requests: cpu: 100m # default is 500m - memory: 200Mi # default is 2048Mi == 2Gi + memory: 700Mi # default is 2048Mi == 2Gi + hpaSpec: + minReplicas: 1 + maxReplicas: 5 # edit autoscaleEnabled in values if increasing this + metrics: + - type: Resource + resource: + name: cpu + targetAverageUtilization: 95 + - type: Resource + resource: + name: memory + targetAverageUtilization: 80 + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + 
name: istiod cni: enabled: false ingressGateways: From 9e5dfafb602bf206decc5a0f9e0bc6055b4f7583 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 14 Jul 2021 22:48:00 +0300 Subject: [PATCH 02/40] Move prometheus exporters and fluent bit to their dedicated namespace --- manager/manifests/fluent-bit.yaml.j2 | 10 +++++----- manager/manifests/prometheus-dcgm-exporter.yaml | 6 +++--- manager/manifests/prometheus-kube-state-metrics.yaml | 8 ++++---- manager/manifests/prometheus-kubelet-exporter.yaml | 2 +- manager/manifests/prometheus-monitoring.yaml | 2 +- manager/manifests/prometheus-node-exporter.yaml | 12 ++++++------ manager/manifests/prometheus-operator.yaml | 8 ++++---- manager/manifests/prometheus-statsd-exporter.yaml | 6 +++--- 8 files changed, 27 insertions(+), 27 deletions(-) diff --git a/manager/manifests/fluent-bit.yaml.j2 b/manager/manifests/fluent-bit.yaml.j2 index a2e1140f2c..4e75212e35 100644 --- a/manager/manifests/fluent-bit.yaml.j2 +++ b/manager/manifests/fluent-bit.yaml.j2 @@ -16,7 +16,7 @@ apiVersion: v1 kind: ServiceAccount metadata: name: fluent-bit - namespace: default + namespace: logging --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -40,13 +40,13 @@ roleRef: subjects: - kind: ServiceAccount name: fluent-bit - namespace: default + namespace: logging --- apiVersion: v1 kind: ConfigMap metadata: name: fluent-bit-config - namespace: default + namespace: logging labels: k8s-app: fluent-bit data: @@ -83,7 +83,7 @@ data: [FILTER] Name kubernetes Match kube.var.log.containers.* - Kube_URL https://kubernetes.default.svc:443 + Kube_URL https://kubernetes.logging.svc:443 Kube_Tag_Prefix kube.var.log.containers. 
Merge_Log On @@ -186,7 +186,7 @@ apiVersion: apps/v1 kind: DaemonSet metadata: name: fluent-bit - namespace: default + namespace: logging spec: selector: matchLabels: diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml index 8b37d969dd..e93ce14e9e 100644 --- a/manager/manifests/prometheus-dcgm-exporter.yaml +++ b/manager/manifests/prometheus-dcgm-exporter.yaml @@ -21,7 +21,7 @@ apiVersion: v1 kind: ServiceAccount metadata: name: dcgm-exporter - namespace: default + namespace: prometheus labels: app.kubernetes.io/name: dcgm-exporter app.kubernetes.io/instance: dcgm-exporter @@ -31,7 +31,7 @@ apiVersion: apps/v1 kind: DaemonSet metadata: name: dcgm-exporter - namespace: default + namespace: prometheus labels: app.kubernetes.io/name: dcgm-exporter app.kubernetes.io/instance: dcgm-exporter @@ -106,7 +106,7 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: dcgm-exporter - namespace: default + namespace: prometheus labels: monitoring.cortex.dev: dcgm-exporter app.kubernetes.io/name: dcgm-exporter diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index edf69cd7ba..89da6c4842 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -17,7 +17,7 @@ metadata: labels: app.kubernetes.io/name: kube-state-metrics name: kube-state-metrics - namespace: default + namespace: prometheus --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -180,13 +180,13 @@ roleRef: subjects: - kind: ServiceAccount name: kube-state-metrics - namespace: default + namespace: prometheus --- apiVersion: apps/v1 kind: Deployment metadata: name: kube-state-metrics - namespace: default + namespace: prometheus labels: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/version: "2.1.0" @@ -245,7 +245,7 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: 
name: kube-state-metrics - namespace: default + namespace: prometheus labels: name: kube-state-metrics monitoring.cortex.dev: kube-state-metrics diff --git a/manager/manifests/prometheus-kubelet-exporter.yaml b/manager/manifests/prometheus-kubelet-exporter.yaml index 8982706c42..87855746a0 100644 --- a/manager/manifests/prometheus-kubelet-exporter.yaml +++ b/manager/manifests/prometheus-kubelet-exporter.yaml @@ -19,7 +19,7 @@ metadata: k8s-app: kubelet monitoring.cortex.dev: kubelet-exporter name: kubelet - namespace: default + namespace: prometheus spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token diff --git a/manager/manifests/prometheus-monitoring.yaml b/manager/manifests/prometheus-monitoring.yaml index 0982504aff..fbfc6309da 100644 --- a/manager/manifests/prometheus-monitoring.yaml +++ b/manager/manifests/prometheus-monitoring.yaml @@ -114,7 +114,7 @@ roleRef: subjects: - kind: ServiceAccount name: prometheus - namespace: default + namespace: prometheus --- diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index 3bb631e15d..c381f22902 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -18,7 +18,7 @@ metadata: labels: app.kubernetes.io/version: v1.1.2 name: node-exporter - namespace: default + namespace: prometheus --- @@ -57,7 +57,7 @@ roleRef: subjects: - kind: ServiceAccount name: node-exporter - namespace: default + namespace: prometheus --- @@ -68,7 +68,7 @@ metadata: app.kubernetes.io/name: node-exporter app.kubernetes.io/version: v1.1.2 name: node-exporter - namespace: default + namespace: prometheus spec: clusterIP: None ports: @@ -87,7 +87,7 @@ metadata: app.kubernetes.io/name: node-exporter app.kubernetes.io/version: v1.1.2 name: node-exporter - namespace: default + namespace: prometheus spec: selector: matchLabels: @@ -181,7 +181,7 @@ metadata: app.kubernetes.io/version: v1.1.2 
monitoring.cortex.dev: node-exporter name: node-exporter - namespace: default + namespace: prometheus spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token @@ -240,7 +240,7 @@ metadata: app.kubernetes.io/version: 1.1.2 prometheus: k8s name: node-exporter-rules - namespace: default + namespace: prometheus spec: groups: - name: node-exporter.rules diff --git a/manager/manifests/prometheus-operator.yaml b/manager/manifests/prometheus-operator.yaml index 3b7b558318..fc610a31a6 100644 --- a/manager/manifests/prometheus-operator.yaml +++ b/manager/manifests/prometheus-operator.yaml @@ -14073,7 +14073,7 @@ roleRef: subjects: - kind: ServiceAccount name: prometheus-operator - namespace: default + namespace: prometheus --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -14163,7 +14163,7 @@ metadata: app.kubernetes.io/name: prometheus-operator app.kubernetes.io/version: 0.48.1 name: prometheus-operator - namespace: default + namespace: prometheus spec: replicas: 1 selector: @@ -14217,7 +14217,7 @@ metadata: app.kubernetes.io/name: prometheus-operator app.kubernetes.io/version: 0.48.1 name: prometheus-operator - namespace: default + namespace: prometheus --- apiVersion: v1 kind: Service @@ -14227,7 +14227,7 @@ metadata: app.kubernetes.io/name: prometheus-operator app.kubernetes.io/version: 0.48.1 name: prometheus-operator - namespace: default + namespace: prometheus spec: clusterIP: None ports: diff --git a/manager/manifests/prometheus-statsd-exporter.yaml b/manager/manifests/prometheus-statsd-exporter.yaml index ea58db52d8..1a1fe6dd33 100644 --- a/manager/manifests/prometheus-statsd-exporter.yaml +++ b/manager/manifests/prometheus-statsd-exporter.yaml @@ -16,7 +16,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: prometheus-statsd-exporter-config - namespace: default + namespace: prometheus data: statsd-mapping.yaml: | defaults: @@ -27,7 +27,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: prometheus-statsd-exporter 
- namespace: default + namespace: prometheus spec: replicas: 1 selector: @@ -93,7 +93,7 @@ spec: apiVersion: v1 kind: Service metadata: - namespace: default + namespace: prometheus name: prometheus-statsd-exporter labels: cortex.dev/name: prometheus-statsd-exporter From 060ebb933e0aa63d3d9f4eb39847b66a8d51542c Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 14 Jul 2021 23:07:50 +0300 Subject: [PATCH 03/40] Change the HPA's targets a bit --- manager/manifests/istio.yaml.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index 53714590ba..b537e08c7b 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ -38,11 +38,11 @@ spec: - type: Resource resource: name: cpu - targetAverageUtilization: 95 + targetAverageUtilization: 90 - type: Resource resource: name: memory - targetAverageUtilization: 80 + targetAverageUtilization: 90 scaleTargetRef: apiVersion: apps/v1 kind: Deployment From d65fafcdf40ab4153ceebf1391b3122adfec30ae Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 00:33:55 +0300 Subject: [PATCH 04/40] Redirect kubectl's output to /dev/null --- manager/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manager/install.sh b/manager/install.sh index 7b873eaba9..367ee76e5a 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -361,7 +361,7 @@ function remove_nodegroups() { function setup_istio() { envsubst < manifests/istio-namespace.yaml | kubectl apply -f - >/dev/null - kubectl label namespaces default istio-discovery=enabled + kubectl label namespaces default istio-discovery=enabled >/dev/null if ! 
grep -q "istio-customgateway-certs" <<< $(kubectl get secret -n istio-system); then WEBSITE=localhost From 4b96c20e0c9728c5b3e488167e92c487c698f6be Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 01:09:14 +0300 Subject: [PATCH 05/40] Add logging namespace --- manager/manifests/fluent-bit.yaml.j2 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/manager/manifests/fluent-bit.yaml.j2 b/manager/manifests/fluent-bit.yaml.j2 index 4e75212e35..b4f3ac68df 100644 --- a/manager/manifests/fluent-bit.yaml.j2 +++ b/manager/manifests/fluent-bit.yaml.j2 @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +apiVersion: v1 +kind: Namespace +metadata: + name: logging +--- apiVersion: v1 kind: ServiceAccount metadata: From 067720840ec5e040b13f84262c09ae52ebb2dcb5 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 01:10:35 +0300 Subject: [PATCH 06/40] Remove unnecessary namespace --- manager/manifests/prometheus-dcgm-exporter.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml index e93ce14e9e..96d82a5644 100644 --- a/manager/manifests/prometheus-dcgm-exporter.yaml +++ b/manager/manifests/prometheus-dcgm-exporter.yaml @@ -12,11 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-apiVersion: v1 -kind: Namespace -metadata: - name: monitoring ---- apiVersion: v1 kind: ServiceAccount metadata: From bbcf4e39abc88ea54c36183eed6c6623f1c85d6c Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 01:25:42 +0300 Subject: [PATCH 07/40] Fixes --- manager/install.sh | 5 +++-- manager/manifests/default-namespace.yaml | 20 ++++++++++++++++++++ manager/manifests/prometheus-namespace.yaml | 18 ++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 manager/manifests/default-namespace.yaml create mode 100644 manager/manifests/prometheus-namespace.yaml diff --git a/manager/install.sh b/manager/install.sh index 367ee76e5a..2450863425 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -220,6 +220,7 @@ function setup_configmap() { } function setup_prometheus() { + kubectl apply -f manifests/prometheus-namespace.yaml >/dev/null envsubst < manifests/prometheus-operator.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-statsd-exporter.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-kubelet-exporter.yaml | kubectl apply -f - >/dev/null @@ -360,8 +361,8 @@ function remove_nodegroups() { } function setup_istio() { - envsubst < manifests/istio-namespace.yaml | kubectl apply -f - >/dev/null - kubectl label namespaces default istio-discovery=enabled >/dev/null + kubectl apply -f manifests/istio-namespace.yaml >/dev/null + kubectl apply -f manifests/default-namespace.yaml >/dev/null if ! grep -q "istio-customgateway-certs" <<< $(kubectl get secret -n istio-system); then WEBSITE=localhost diff --git a/manager/manifests/default-namespace.yaml b/manager/manifests/default-namespace.yaml new file mode 100644 index 0000000000..af30d267ec --- /dev/null +++ b/manager/manifests/default-namespace.yaml @@ -0,0 +1,20 @@ +# Copyright 2021 Cortex Labs, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Namespace +metadata: + name: default + labels: + istio-discovery: enabled diff --git a/manager/manifests/prometheus-namespace.yaml b/manager/manifests/prometheus-namespace.yaml new file mode 100644 index 0000000000..995138ef09 --- /dev/null +++ b/manager/manifests/prometheus-namespace.yaml @@ -0,0 +1,18 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Namespace +metadata: + name: prometheus From da66f7f05817cae1ea071580339773b0c9b572a7 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 01:56:14 +0300 Subject: [PATCH 08/40] Refactoring a bit --- manager/install.sh | 7 ++++--- .../{default-namespace.yaml => namespaces/default.yaml} | 0 .../{istio-namespace.yaml => namespaces/istio.yaml} | 0 .../prometheus.yaml} | 0 4 files changed, 4 insertions(+), 3 deletions(-) rename manager/manifests/{default-namespace.yaml => namespaces/default.yaml} (100%) rename manager/manifests/{istio-namespace.yaml => namespaces/istio.yaml} (100%) rename manager/manifests/{prometheus-namespace.yaml => namespaces/prometheus.yaml} (100%) diff --git a/manager/install.sh b/manager/install.sh index 2450863425..770d0bfec0 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -220,7 +220,7 @@ function setup_configmap() { } function setup_prometheus() { - kubectl apply -f manifests/prometheus-namespace.yaml >/dev/null + kubectl apply -f manifests/namespaces/prometheus.yaml >/dev/null envsubst < manifests/prometheus-operator.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-statsd-exporter.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-kubelet-exporter.yaml | kubectl apply -f - >/dev/null @@ -361,8 +361,9 @@ function remove_nodegroups() { } function setup_istio() { - kubectl apply -f manifests/istio-namespace.yaml >/dev/null - kubectl apply -f manifests/default-namespace.yaml >/dev/null + kubectl apply -f manifests/namespaces/istio.yaml >/dev/null + # to apply the istio-discovery label + kubectl apply -f manifests/namespaces/default.yaml >/dev/null if ! 
grep -q "istio-customgateway-certs" <<< $(kubectl get secret -n istio-system); then WEBSITE=localhost diff --git a/manager/manifests/default-namespace.yaml b/manager/manifests/namespaces/default.yaml similarity index 100% rename from manager/manifests/default-namespace.yaml rename to manager/manifests/namespaces/default.yaml diff --git a/manager/manifests/istio-namespace.yaml b/manager/manifests/namespaces/istio.yaml similarity index 100% rename from manager/manifests/istio-namespace.yaml rename to manager/manifests/namespaces/istio.yaml diff --git a/manager/manifests/prometheus-namespace.yaml b/manager/manifests/namespaces/prometheus.yaml similarity index 100% rename from manager/manifests/prometheus-namespace.yaml rename to manager/manifests/namespaces/prometheus.yaml From 56554fbd187f58f4787425e42d294767eb1356aa Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 01:59:23 +0300 Subject: [PATCH 09/40] Fixes --- manager/manifests/event-exporter.yaml | 8 ++++---- manager/manifests/fluent-bit.yaml.j2 | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/manager/manifests/event-exporter.yaml b/manager/manifests/event-exporter.yaml index ab4847c4a7..3bc701f2a1 100644 --- a/manager/manifests/event-exporter.yaml +++ b/manager/manifests/event-exporter.yaml @@ -15,7 +15,7 @@ apiVersion: v1 kind: ServiceAccount metadata: - namespace: default + namespace: prometheus name: event-exporter --- @@ -30,7 +30,7 @@ roleRef: name: view subjects: - kind: ServiceAccount - namespace: default + namespace: prometheus name: event-exporter --- @@ -39,7 +39,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: event-exporter-config - namespace: default + namespace: prometheus data: config.yaml: | logLevel: error @@ -61,7 +61,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: event-exporter - namespace: default + namespace: prometheus spec: replicas: 1 selector: diff --git a/manager/manifests/fluent-bit.yaml.j2 
b/manager/manifests/fluent-bit.yaml.j2 index b4f3ac68df..2bff58b1fb 100644 --- a/manager/manifests/fluent-bit.yaml.j2 +++ b/manager/manifests/fluent-bit.yaml.j2 @@ -88,7 +88,7 @@ data: [FILTER] Name kubernetes Match kube.var.log.containers.* - Kube_URL https://kubernetes.logging.svc:443 + Kube_URL https://kubernetes.default.svc:443 Kube_Tag_Prefix kube.var.log.containers. Merge_Log On From 77dc2e91bf55b00c4f5c7096403515605ae13fab Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 02:12:50 +0300 Subject: [PATCH 10/40] More refactoring --- manager/install.sh | 19 +++++++++++++------ manager/manifests/fluent-bit.yaml.j2 | 5 ----- manager/manifests/namespaces/logging.yaml | 18 ++++++++++++++++++ pkg/workloads/k8s.go | 2 +- 4 files changed, 32 insertions(+), 12 deletions(-) create mode 100644 manager/manifests/namespaces/logging.yaml diff --git a/manager/install.sh b/manager/install.sh index 770d0bfec0..04bc320dfc 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -33,7 +33,11 @@ function main() { } function cluster_up() { - create_eks + check_eks + + echo -n "○ creating namespaces " + setup_namespaces + echo "✓" echo -n "○ updating cluster configuration " setup_configmap @@ -195,6 +199,14 @@ function write_kubeconfig() { out=$(kubectl get pods 2>&1 || true); if [[ "$out" == *"must be logged in to the server"* ]]; then echo "error: your aws iam user does not have access to this cluster; to grant access, see https://docs.cortex.dev/v/${CORTEX_VERSION_MINOR}/"; exit 1; fi } +function setup_namespaces() { + # to apply the istio-discovery label + kubectl apply -f manifests/namespaces/default.yaml >/dev/null + kubectl apply -f manifests/namespaces/istio.yaml >/dev/null + kubectl apply -f manifests/namespaces/prometheus.yaml >/dev/null + kubectl apply -f manifests/namespaces/logging.yaml >/dev/null +} + function setup_configmap() { envsubst < manifests/default_cortex_cli_config.yaml > tmp_cli_config.yaml kubectl -n=default create configmap 
'client-config' \ @@ -220,7 +232,6 @@ function setup_configmap() { } function setup_prometheus() { - kubectl apply -f manifests/namespaces/prometheus.yaml >/dev/null envsubst < manifests/prometheus-operator.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-statsd-exporter.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-kubelet-exporter.yaml | kubectl apply -f - >/dev/null @@ -361,10 +372,6 @@ function remove_nodegroups() { } function setup_istio() { - kubectl apply -f manifests/namespaces/istio.yaml >/dev/null - # to apply the istio-discovery label - kubectl apply -f manifests/namespaces/default.yaml >/dev/null - if ! grep -q "istio-customgateway-certs" <<< $(kubectl get secret -n istio-system); then WEBSITE=localhost openssl req -subj "/C=US/CN=$WEBSITE" -newkey rsa:2048 -nodes -keyout $WEBSITE.key -x509 -days 3650 -out $WEBSITE.crt >/dev/null 2>&1 diff --git a/manager/manifests/fluent-bit.yaml.j2 b/manager/manifests/fluent-bit.yaml.j2 index 2bff58b1fb..03fef37062 100644 --- a/manager/manifests/fluent-bit.yaml.j2 +++ b/manager/manifests/fluent-bit.yaml.j2 @@ -12,11 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -apiVersion: v1 -kind: Namespace -metadata: - name: logging ---- apiVersion: v1 kind: ServiceAccount metadata: diff --git a/manager/manifests/namespaces/logging.yaml b/manager/manifests/namespaces/logging.yaml new file mode 100644 index 0000000000..02fe6ce079 --- /dev/null +++ b/manager/manifests/namespaces/logging.yaml @@ -0,0 +1,18 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Namespace +metadata: + name: logging diff --git a/pkg/workloads/k8s.go b/pkg/workloads/k8s.go index 06e0526472..ecf25b3aef 100644 --- a/pkg/workloads/k8s.go +++ b/pkg/workloads/k8s.go @@ -61,7 +61,7 @@ const ( _clusterConfigConfigMap = "cluster-config" _clusterConfigDir = "/configs/cluster" - _statsdAddress = "prometheus-statsd-exporter.default:9125" + _statsdAddress = "prometheus-statsd-exporter.prometheus:9125" ) var ( From 50249d826c621529d8dfe688ae0bc46b87466dcd Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 02:15:11 +0300 Subject: [PATCH 11/40] Event exporter must be in logging namespace --- manager/manifests/event-exporter.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/manager/manifests/event-exporter.yaml b/manager/manifests/event-exporter.yaml index 3bc701f2a1..8ff19efb09 100644 --- a/manager/manifests/event-exporter.yaml +++ b/manager/manifests/event-exporter.yaml @@ -15,7 +15,7 @@ apiVersion: v1 kind: ServiceAccount metadata: - namespace: prometheus + namespace: logging name: event-exporter --- @@ -30,7 +30,7 @@ roleRef: name: view subjects: - kind: ServiceAccount - namespace: prometheus + namespace: logging name: event-exporter --- @@ -39,7 +39,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: event-exporter-config - namespace: prometheus + namespace: logging data: config.yaml: | logLevel: error @@ -61,7 +61,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: event-exporter - namespace: prometheus + namespace: logging spec: replicas: 1 selector: From 
23918dfeacc8a267eae62312522fd755fc334b5f Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 02:27:15 +0300 Subject: [PATCH 12/40] Use consts where possible --- manager/manifests/prometheus-monitoring.yaml | 2 ++ pkg/config/config.go | 4 +-- pkg/consts/consts.go | 7 +++-- pkg/health/health.go | 31 ++++++++++---------- pkg/workloads/k8s.go | 5 ++-- 5 files changed, 28 insertions(+), 21 deletions(-) diff --git a/manager/manifests/prometheus-monitoring.yaml b/manager/manifests/prometheus-monitoring.yaml index fbfc6309da..f42e0d078f 100644 --- a/manager/manifests/prometheus-monitoring.yaml +++ b/manager/manifests/prometheus-monitoring.yaml @@ -27,6 +27,7 @@ apiVersion: monitoring.coreos.com/v1 kind: Prometheus metadata: name: prometheus + namespace: prometheus spec: image: $CORTEX_IMAGE_PROMETHEUS serviceAccountName: prometheus @@ -73,6 +74,7 @@ apiVersion: v1 kind: ServiceAccount metadata: name: prometheus + namespace: prometheus --- diff --git a/pkg/config/config.go b/pkg/config/config.go index 304ee1dcf3..eb7bd5e269 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -152,7 +152,7 @@ func Init() error { prometheusURL := os.Getenv("CORTEX_PROMETHEUS_URL") if len(prometheusURL) == 0 { - prometheusURL = fmt.Sprintf("http://prometheus.%s:9090", consts.DefaultNamespace) + prometheusURL = fmt.Sprintf("http://prometheus.%s:9090", consts.PrometheusNamespace) } promClient, err := promapi.NewClient(promapi.Config{ @@ -168,7 +168,7 @@ func Init() error { } if OperatorMetadata.IsOperatorInCluster { - MetricsClient, err = statsd.New(fmt.Sprintf("prometheus-statsd-exporter.%s:9125", consts.DefaultNamespace)) + MetricsClient, err = statsd.New(fmt.Sprintf("prometheus-statsd-exporter.%s:9125", consts.PrometheusNamespace)) if err != nil { return errors.Wrap(errors.WithStack(err), "unable to initialize metrics client") } diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go index 67a9c7e4d0..4b06b6b589 100644 --- a/pkg/consts/consts.go +++ 
b/pkg/consts/consts.go @@ -27,8 +27,11 @@ var ( CortexVersion = "master" // CORTEX_VERSION CortexVersionMinor = "master" // CORTEX_VERSION_MINOR - DefaultNamespace = "default" - IstioNamespace = "istio-system" + KubeSystemNamespace = "kube-system" + DefaultNamespace = "default" + PrometheusNamespace = "prometheus" + LoggingNamespace = "logging" + IstioNamespace = "istio-system" DefaultMaxQueueLength = int64(100) DefaultMaxConcurrency = int64(1) diff --git a/pkg/health/health.go b/pkg/health/health.go index 45543ec220..cd556dd80b 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -22,6 +22,7 @@ import ( "reflect" "github.com/aws/aws-sdk-go/service/elbv2" + "github.com/cortexlabs/cortex/pkg/consts" awslib "github.com/cortexlabs/cortex/pkg/lib/aws" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/json" @@ -106,47 +107,47 @@ func Check(awsClient *awslib.Client, k8sClient *k8s.Client, clusterName string) if err := parallel.RunFirstErr( func() error { var err error - operatorHealth, err = getDeploymentReadiness(k8sClient, "operator", "default") + operatorHealth, err = getDeploymentReadiness(k8sClient, "operator", consts.DefaultNamespace) return err }, func() error { var err error - controllerManagerHealth, err = getDeploymentReadiness(k8sClient, "operator-controller-manager", "default") + controllerManagerHealth, err = getDeploymentReadiness(k8sClient, "operator-controller-manager", consts.DefaultNamespace) return err }, func() error { var err error - prometheusHealth, err = getStatefulSetReadiness(k8sClient, "prometheus-prometheus", "default") + prometheusHealth, err = getStatefulSetReadiness(k8sClient, "prometheus-prometheus", consts.PrometheusNamespace) return err }, func() error { var err error - autoscalerHealth, err = getDeploymentReadiness(k8sClient, "autoscaler", "default") + autoscalerHealth, err = getDeploymentReadiness(k8sClient, "autoscaler", consts.DefaultNamespace) return err }, func() error { var err error - 
activatorHealth, err = getDeploymentReadiness(k8sClient, "activator", "default") + activatorHealth, err = getDeploymentReadiness(k8sClient, "activator", consts.DefaultNamespace) return err }, func() error { var err error - grafanaHealth, err = getStatefulSetReadiness(k8sClient, "grafana", "default") + grafanaHealth, err = getStatefulSetReadiness(k8sClient, "grafana", consts.DefaultNamespace) return err }, func() error { var err error - operatorGatewayHealth, err = getDeploymentReadiness(k8sClient, "ingressgateway-operator", "istio-system") + operatorGatewayHealth, err = getDeploymentReadiness(k8sClient, "ingressgateway-operator", consts.IstioNamespace) return err }, func() error { var err error - apisGatewayHealth, err = getDeploymentReadiness(k8sClient, "ingressgateway-apis", "istio-system") + apisGatewayHealth, err = getDeploymentReadiness(k8sClient, "ingressgateway-apis", consts.IstioNamespace) return err }, func() error { var err error - clusterAutoscalerHealth, err = getDeploymentReadiness(k8sClient, "cluster-autoscaler", "kube-system") + clusterAutoscalerHealth, err = getDeploymentReadiness(k8sClient, "cluster-autoscaler", consts.KubeSystemNamespace) return err }, func() error { @@ -161,32 +162,32 @@ func Check(awsClient *awslib.Client, k8sClient *k8s.Client, clusterName string) }, func() error { var err error - fluentBitHealth, err = getDaemonSetReadiness(k8sClient, "fluent-bit", "default") + fluentBitHealth, err = getDaemonSetReadiness(k8sClient, "fluent-bit", consts.LoggingNamespace) return err }, func() error { var err error - dcgmExporterHealth, err = getDaemonSetReadiness(k8sClient, "dcgm-exporter", "default") + dcgmExporterHealth, err = getDaemonSetReadiness(k8sClient, "dcgm-exporter", consts.PrometheusNamespace) return err }, func() error { var err error - nodeExporterHealth, err = getDaemonSetReadiness(k8sClient, "node-exporter", "default") + nodeExporterHealth, err = getDaemonSetReadiness(k8sClient, "node-exporter", consts.PrometheusNamespace) 
return err }, func() error { var err error - statsdExporterHealth, err = getDeploymentReadiness(k8sClient, "prometheus-statsd-exporter", "default") + statsdExporterHealth, err = getDeploymentReadiness(k8sClient, "prometheus-statsd-exporter", consts.PrometheusNamespace) return err }, func() error { var err error - eventExporterHealth, err = getDeploymentReadiness(k8sClient, "event-exporter", "default") + eventExporterHealth, err = getDeploymentReadiness(k8sClient, "event-exporter", consts.LoggingNamespace) return err }, func() error { var err error - kubeStateMetricsHealth, err = getDeploymentReadiness(k8sClient, "kube-state-metrics", "default") + kubeStateMetricsHealth, err = getDeploymentReadiness(k8sClient, "kube-state-metrics", consts.PrometheusNamespace) return err }, ); err != nil { diff --git a/pkg/workloads/k8s.go b/pkg/workloads/k8s.go index ecf25b3aef..0523c8cbfe 100644 --- a/pkg/workloads/k8s.go +++ b/pkg/workloads/k8s.go @@ -17,6 +17,7 @@ limitations under the License. package workloads import ( + "fmt" "path" "strings" @@ -60,14 +61,14 @@ const ( _clusterConfigDirVolume = "cluster-config" _clusterConfigConfigMap = "cluster-config" _clusterConfigDir = "/configs/cluster" - - _statsdAddress = "prometheus-statsd-exporter.prometheus:9125" ) var ( _asyncGatewayCPURequest = kresource.MustParse("100m") _asyncGatewayMemRequest = kresource.MustParse("100Mi") + _statsdAddress = fmt.Sprintf("prometheus-statsd-exporter.%s:9125", consts.PrometheusNamespace) + // each Inferentia chip requires 128 HugePages with each HugePage having a size of 2Mi _hugePagesMemPerInf = int64(128 * 2 * 1024 * 1024) // bytes ) From a2874571a6901865ed5e26e0d13fb21ace315663 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 02:42:10 +0300 Subject: [PATCH 13/40] Have all namespaces in a single yaml & revert temporary change --- manager/install.sh | 16 +++++----------- .../default.yaml => namespaces.yaml} | 19 +++++++++++++++++++ 
manager/manifests/namespaces/istio.yaml | 18 ------------------ manager/manifests/namespaces/logging.yaml | 18 ------------------ manager/manifests/namespaces/prometheus.yaml | 18 ------------------ 5 files changed, 24 insertions(+), 65 deletions(-) rename manager/manifests/{namespaces/default.yaml => namespaces.yaml} (77%) delete mode 100644 manager/manifests/namespaces/istio.yaml delete mode 100644 manager/manifests/namespaces/logging.yaml delete mode 100644 manager/manifests/namespaces/prometheus.yaml diff --git a/manager/install.sh b/manager/install.sh index 04bc320dfc..89d87b3906 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -33,10 +33,10 @@ function main() { } function cluster_up() { - check_eks + create_eks echo -n "○ creating namespaces " - setup_namespaces + kubectl apply -f manifests/namespaces.yaml >/dev/null echo "✓" echo -n "○ updating cluster configuration " @@ -199,14 +199,6 @@ function write_kubeconfig() { out=$(kubectl get pods 2>&1 || true); if [[ "$out" == *"must be logged in to the server"* ]]; then echo "error: your aws iam user does not have access to this cluster; to grant access, see https://docs.cortex.dev/v/${CORTEX_VERSION_MINOR}/"; exit 1; fi } -function setup_namespaces() { - # to apply the istio-discovery label - kubectl apply -f manifests/namespaces/default.yaml >/dev/null - kubectl apply -f manifests/namespaces/istio.yaml >/dev/null - kubectl apply -f manifests/namespaces/prometheus.yaml >/dev/null - kubectl apply -f manifests/namespaces/logging.yaml >/dev/null -} - function setup_configmap() { envsubst < manifests/default_cortex_cli_config.yaml > tmp_cli_config.yaml kubectl -n=default create configmap 'client-config' \ @@ -239,7 +231,9 @@ function setup_prometheus() { envsubst < manifests/prometheus-node-exporter.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-monitoring.yaml | kubectl apply -f - >/dev/null python render_template.py $CORTEX_CLUSTER_CONFIG_FILE 
manifests/prometheus-additional-scrape-configs.yaml.j2 > prometheus-additional-scrape-configs.yaml - kubectl create secret generic additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml + if ! kubectl get secret additional-scrape-configs >/dev/null 2>&1; then + kubectl create secret generic additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml + fi } function setup_grafana() { diff --git a/manager/manifests/namespaces/default.yaml b/manager/manifests/namespaces.yaml similarity index 77% rename from manager/manifests/namespaces/default.yaml rename to manager/manifests/namespaces.yaml index af30d267ec..e6612f5304 100644 --- a/manager/manifests/namespaces/default.yaml +++ b/manager/manifests/namespaces.yaml @@ -18,3 +18,22 @@ metadata: name: default labels: istio-discovery: enabled +--- + +apiVersion: v1 +kind: Namespace +metadata: + name: istio-system +--- + +apiVersion: v1 +kind: Namespace +metadata: + name: logging +--- + +apiVersion: v1 +kind: Namespace +metadata: + name: prometheus +--- diff --git a/manager/manifests/namespaces/istio.yaml b/manager/manifests/namespaces/istio.yaml deleted file mode 100644 index 3f5ce71534..0000000000 --- a/manager/manifests/namespaces/istio.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2021 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -apiVersion: v1 -kind: Namespace -metadata: - name: istio-system diff --git a/manager/manifests/namespaces/logging.yaml b/manager/manifests/namespaces/logging.yaml deleted file mode 100644 index 02fe6ce079..0000000000 --- a/manager/manifests/namespaces/logging.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2021 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: Namespace -metadata: - name: logging diff --git a/manager/manifests/namespaces/prometheus.yaml b/manager/manifests/namespaces/prometheus.yaml deleted file mode 100644 index 995138ef09..0000000000 --- a/manager/manifests/namespaces/prometheus.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2021 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -apiVersion: v1 -kind: Namespace -metadata: - name: prometheus From 34045d44e0cb48fb304b0e1620be6b23e70a72b3 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 03:24:08 +0300 Subject: [PATCH 14/40] Patch instead of applying the default namespace --- manager/install.sh | 10 ++++++++-- manager/manifests/namespaces.yaml | 8 -------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/manager/install.sh b/manager/install.sh index 89d87b3906..18e5a0936b 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -36,7 +36,7 @@ function cluster_up() { create_eks echo -n "○ creating namespaces " - kubectl apply -f manifests/namespaces.yaml >/dev/null + setup_namespaces echo "✓" echo -n "○ updating cluster configuration " @@ -199,6 +199,12 @@ function write_kubeconfig() { out=$(kubectl get pods 2>&1 || true); if [[ "$out" == *"must be logged in to the server"* ]]; then echo "error: your aws iam user does not have access to this cluster; to grant access, see https://docs.cortex.dev/v/${CORTEX_VERSION_MINOR}/"; exit 1; fi } +function setup_namespaces() { + # doing a patch to prevent getting the kubectl.kubernetes.io/last-applied-configuration annotation warning + kubectl patch namespace default -p '{"metadata": {"labels": {"istio-discovery": "enabled"}}}' >/dev/null + kubectl apply -f manifests/namespaces.yaml >/dev/null +} + function setup_configmap() { envsubst < manifests/default_cortex_cli_config.yaml > tmp_cli_config.yaml kubectl -n=default create configmap 'client-config' \ @@ -232,7 +238,7 @@ function setup_prometheus() { envsubst < manifests/prometheus-monitoring.yaml | kubectl apply -f - >/dev/null python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-additional-scrape-configs.yaml.j2 > prometheus-additional-scrape-configs.yaml if ! 
kubectl get secret additional-scrape-configs >/dev/null 2>&1; then - kubectl create secret generic additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml + kubectl create secret generic additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml > /dev/null fi } diff --git a/manager/manifests/namespaces.yaml b/manager/manifests/namespaces.yaml index e6612f5304..ce1959e923 100644 --- a/manager/manifests/namespaces.yaml +++ b/manager/manifests/namespaces.yaml @@ -12,14 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -apiVersion: v1 -kind: Namespace -metadata: - name: default - labels: - istio-discovery: enabled ---- - apiVersion: v1 kind: Namespace metadata: From b24dff9e750b3643a26773f005f516459067bec0 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 03:29:50 +0300 Subject: [PATCH 15/40] Fix waiting on the load balancer stage --- manager/install.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manager/install.sh b/manager/install.sh index 18e5a0936b..8ce233fb0d 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -540,8 +540,8 @@ function validate_cortex() { fi if [ "$prometheus_ready" == "" ]; then - readyReplicas=$(kubectl get statefulset -n default prometheus-prometheus -o jsonpath='{.status.readyReplicas}' 2> /dev/null) - desiredReplicas=$(kubectl get statefulset -n default prometheus-prometheus -o jsonpath='{.status.replicas}' 2> /dev/null) + readyReplicas=$(kubectl get statefulset -n prometheus prometheus-prometheus -o jsonpath='{.status.readyReplicas}' 2> /dev/null) + desiredReplicas=$(kubectl get statefulset -n prometheus prometheus-prometheus -o jsonpath='{.status.replicas}' 2> /dev/null) if [ "$readyReplicas" != "" ] && [ "$desiredReplicas" != "" ]; then if [ "$readyReplicas" == "$desiredReplicas" ]; then From 6c9421d0374f31e0bd14a1a13f9de59af0486fbb Mon Sep 17 00:00:00 2001 From: Robert 
Lucian Chiriac Date: Thu, 15 Jul 2021 16:00:13 +0300 Subject: [PATCH 16/40] More namespace fixes --- cli/cmd/cluster.go | 4 ++-- manager/manifests/autoscaler.yaml.j2 | 2 +- manager/manifests/grafana/grafana.yaml.j2 | 2 +- pkg/consts/consts.go | 4 ++-- pkg/crds/hack/run_manager.sh | 2 +- pkg/crds/main.go | 2 +- pkg/health/health.go | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index 3311c60a01..f89ba386f3 100644 --- a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -358,7 +358,7 @@ var _clusterConfigureCmd = &cobra.Command{ exit.Error(err) } - k8sClient, err := k8s.New("default", false, restConfig, scheme) + k8sClient, err := k8s.New(consts.DefaultNamespace, false, restConfig, scheme) if err != nil { exit.Error(err) } @@ -804,7 +804,7 @@ var _clusterHealthCmd = &cobra.Command{ exit.Error(err) } - k8sClient, err := k8s.New("default", false, restConfig, scheme) + k8sClient, err := k8s.New(consts.DefaultNamespace, false, restConfig, scheme) if err != nil { exit.Error(err) } diff --git a/manager/manifests/autoscaler.yaml.j2 b/manager/manifests/autoscaler.yaml.j2 index df42a90965..842552f31a 100644 --- a/manager/manifests/autoscaler.yaml.j2 +++ b/manager/manifests/autoscaler.yaml.j2 @@ -82,7 +82,7 @@ spec: args: - "--in-cluster" - "--port=8000" - - "--prometheus-url=http://prometheus.default:9090" + - "--prometheus-url=http://prometheus.prometheus:9090" - "--namespace=default" - "--cluster-config=/configs/cluster/cluster.yaml" ports: diff --git a/manager/manifests/grafana/grafana.yaml.j2 b/manager/manifests/grafana/grafana.yaml.j2 index ec863d3fb5..1cd9052393 100644 --- a/manager/manifests/grafana/grafana.yaml.j2 +++ b/manager/manifests/grafana/grafana.yaml.j2 @@ -28,7 +28,7 @@ data: "name": "prometheus", "orgId": 1, "type": "prometheus", - "url": "http://prometheus.default:9090", + "url": "http://prometheus.prometheus:9090", "version": 1, "isDefault": true } diff --git a/pkg/consts/consts.go 
b/pkg/consts/consts.go index 4b06b6b589..7ea590fc45 100644 --- a/pkg/consts/consts.go +++ b/pkg/consts/consts.go @@ -27,11 +27,11 @@ var ( CortexVersion = "master" // CORTEX_VERSION CortexVersionMinor = "master" // CORTEX_VERSION_MINOR - KubeSystemNamespace = "kube-system" DefaultNamespace = "default" + KubeSystemNamespace = "kube-system" + IstioNamespace = "istio-system" PrometheusNamespace = "prometheus" LoggingNamespace = "logging" - IstioNamespace = "istio-system" DefaultMaxQueueLength = int64(100) DefaultMaxConcurrency = int64(1) diff --git a/pkg/crds/hack/run_manager.sh b/pkg/crds/hack/run_manager.sh index 1b7d1a79fc..acd0ac9b37 100755 --- a/pkg/crds/hack/run_manager.sh +++ b/pkg/crds/hack/run_manager.sh @@ -18,7 +18,7 @@ CLUSTER_CONFIG=$1 -port_forward_cmd="kubectl port-forward -n default prometheus-prometheus-0 9090" +port_forward_cmd="kubectl port-forward -n prometheus prometheus-prometheus-0 9090" kill $(pgrep -f "${port_forward_cmd}") >/dev/null 2>&1 || true echo "Port-forwarding Prometheus to localhost:9090" diff --git a/pkg/crds/main.go b/pkg/crds/main.go index 62b552a426..ee8c0c476b 100644 --- a/pkg/crds/main.go +++ b/pkg/crds/main.go @@ -102,7 +102,7 @@ func main() { } if prometheusURL == "" { - prometheusURL = fmt.Sprintf("http://prometheus.%s:9090", consts.DefaultNamespace) + prometheusURL = fmt.Sprintf("http://prometheus.%s:9090", consts.PrometheusNamespace) } mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ diff --git a/pkg/health/health.go b/pkg/health/health.go index cd556dd80b..7e7ea9e9d5 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -218,7 +218,7 @@ func Check(awsClient *awslib.Client, k8sClient *k8s.Client, clusterName string) func GetWarnings(k8sClient *k8s.Client) (ClusterWarnings, error) { var prometheusMemorySaturationWarn string - saturation, err := getPodMemorySaturation(k8sClient, "prometheus-prometheus-0", "default") + saturation, err := getPodMemorySaturation(k8sClient, "prometheus-prometheus-0", 
consts.PrometheusNamespace) if err != nil { return ClusterWarnings{}, err } From 4f4daefa4a1ae13e9c7a9379c798c44227189a82 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 16:26:20 +0300 Subject: [PATCH 17/40] Create additional-scrape-configs in prometheus ns --- manager/install.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manager/install.sh b/manager/install.sh index 8ce233fb0d..c5c0881c46 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -237,8 +237,8 @@ function setup_prometheus() { envsubst < manifests/prometheus-node-exporter.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-monitoring.yaml | kubectl apply -f - >/dev/null python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-additional-scrape-configs.yaml.j2 > prometheus-additional-scrape-configs.yaml - if ! kubectl get secret additional-scrape-configs >/dev/null 2>&1; then - kubectl create secret generic additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml > /dev/null + if ! 
kubectl get secret -n prometheus additional-scrape-configs >/dev/null 2>&1; then + kubectl create secret generic -n prometheus additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml > /dev/null fi } From 2d572aa3f057934daec649b73d168ac96bf2f596 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 16:26:34 +0300 Subject: [PATCH 18/40] Prometheus' service must be in prometheus ns --- manager/manifests/prometheus-monitoring.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/manager/manifests/prometheus-monitoring.yaml b/manager/manifests/prometheus-monitoring.yaml index f42e0d078f..bda334e4d0 100644 --- a/manager/manifests/prometheus-monitoring.yaml +++ b/manager/manifests/prometheus-monitoring.yaml @@ -124,6 +124,7 @@ apiVersion: v1 kind: Service metadata: name: prometheus + namespace: prometheus spec: type: ClusterIP ports: From fde6ab05351fd1a487f8a9b4e7a418569dcc579e Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 17:53:03 +0300 Subject: [PATCH 19/40] Fix cortex cluster health cmd panicking --- cli/cmd/cluster.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index f89ba386f3..bb5d7882fb 100644 --- a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -814,6 +814,11 @@ var _clusterHealthCmd = &cobra.Command{ exit.Error(err) } + k8sClient, err = k8s.New(consts.PrometheusNamespace, false, restConfig, scheme) + if err != nil { + exit.Error(err) + } + clusterWarnings, err := health.GetWarnings(k8sClient) if err != nil { exit.Error(err) From 4c6b670d816c128f12b3bc94f029f47ac28e8dd4 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 17:57:58 +0300 Subject: [PATCH 20/40] Fix getPodMemorySaturation function instead --- cli/cmd/cluster.go | 5 ----- pkg/health/health.go | 8 ++++++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index bb5d7882fb..f89ba386f3 100644 --- 
a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -814,11 +814,6 @@ var _clusterHealthCmd = &cobra.Command{ exit.Error(err) } - k8sClient, err = k8s.New(consts.PrometheusNamespace, false, restConfig, scheme) - if err != nil { - exit.Error(err) - } - clusterWarnings, err := health.GetWarnings(k8sClient) if err != nil { exit.Error(err) diff --git a/pkg/health/health.go b/pkg/health/health.go index 7e7ea9e9d5..014e4b9f48 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -30,6 +30,7 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/parallel" "github.com/cortexlabs/cortex/pkg/types/clusterconfig" kapps "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" kresource "k8s.io/apimachinery/pkg/api/resource" kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -296,8 +297,11 @@ func getLoadBalancerHealth(awsClient *awslib.Client, clusterName string, loadBal func getPodMemorySaturation(k8sClient *k8s.Client, podName, namespace string) (float64, error) { ctx := context.Background() - pod, err := k8sClient.GetPod(podName) - if err != nil { + var pod v1.Pod + if err := k8sClient.Get(ctx, ctrlclient.ObjectKey{ + Namespace: namespace, + Name: podName, + }, &pod); err != nil { return 0, err } From 042b2b3a45332de4149b1770a5beb9b76cb3a844 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 20:16:07 +0300 Subject: [PATCH 21/40] Report live (x/y) and up-to-date replicas --- cli/cluster/delete.go | 2 +- cli/cmd/get.go | 8 ++- cli/cmd/lib_async_apis.go | 25 ++-------- cli/cmd/lib_batch_apis.go | 2 +- cli/cmd/lib_realtime_apis.go | 21 ++------ cli/cmd/lib_task_apis.go | 2 +- cli/cmd/lib_traffic_splitters.go | 13 ++--- pkg/operator/resources/asyncapi/status.go | 52 ++++---------------- pkg/operator/resources/realtimeapi/api.go | 4 +- pkg/operator/resources/realtimeapi/status.go | 48 +++++------------- pkg/operator/resources/resources.go | 5 +- pkg/types/status/status.go | 13 ++--- 12 files changed, 52 insertions(+), 143 
deletions(-) diff --git a/cli/cluster/delete.go b/cli/cluster/delete.go index e81624f98d..7b1d96d86d 100644 --- a/cli/cluster/delete.go +++ b/cli/cluster/delete.go @@ -70,7 +70,7 @@ func getReadyRealtimeAPIReplicasOrNil(operatorConfig OperatorConfig, apiName str return nil } - totalReady := apiRes.Status.Updated.Ready + apiRes.Status.Stale.Ready + totalReady := apiRes.Status.Ready return &totalReady } diff --git a/cli/cmd/get.go b/cli/cmd/get.go index 1b11b984a0..e366b432e4 100644 --- a/cli/cmd/get.go +++ b/cli/cmd/get.go @@ -41,12 +41,10 @@ import ( const ( _titleEnvironment = "env" _titleRealtimeAPI = "realtime api" - _titleStatus = "status" + _titleAsyncAPI = "async api" + _titleLive = "live" _titleUpToDate = "up-to-date" - _titleStale = "stale" - _titleRequested = "requested" - _titleFailed = "failed" - _titleLastupdated = "last update" + _titleLastUpdated = "last update" ) var ( diff --git a/cli/cmd/lib_async_apis.go b/cli/cmd/lib_async_apis.go index 114c88bca8..3b0347e51c 100644 --- a/cli/cmd/lib_async_apis.go +++ b/cli/cmd/lib_async_apis.go @@ -17,6 +17,7 @@ limitations under the License. 
package cmd import ( + "fmt" "strings" "time" @@ -27,10 +28,6 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/schema" ) -const ( - _titleAsyncAPI = "async api" -) - func asyncAPITable(asyncAPI schema.APIResponse, env cliconfig.Environment) (string, error) { var out string @@ -60,36 +57,24 @@ func asyncAPITable(asyncAPI schema.APIResponse, env cliconfig.Environment) (stri func asyncAPIsTable(asyncAPIs []schema.APIResponse, envNames []string) table.Table { rows := make([][]interface{}, 0, len(asyncAPIs)) - var totalFailed int32 - var totalStale int32 - for i, asyncAPI := range asyncAPIs { lastUpdated := time.Unix(asyncAPI.Spec.LastUpdated, 0) rows = append(rows, []interface{}{ envNames[i], asyncAPI.Spec.Name, - asyncAPI.Status.Message(), - asyncAPI.Status.Updated.Ready, - asyncAPI.Status.Stale.Ready, - asyncAPI.Status.Requested, - asyncAPI.Status.Updated.TotalFailed(), + fmt.Sprintf("%d/%d", asyncAPI.Status.Ready, asyncAPI.Status.Requested), + asyncAPI.Status.UpToDate, libtime.SinceStr(&lastUpdated), }) - - totalFailed += asyncAPI.Status.Updated.TotalFailed() - totalStale += asyncAPI.Status.Stale.Ready } return table.Table{ Headers: []table.Header{ {Title: _titleEnvironment}, {Title: _titleAsyncAPI}, - {Title: _titleStatus}, + {Title: _titleLive}, {Title: _titleUpToDate}, - {Title: _titleStale, Hidden: totalStale == 0}, - {Title: _titleRequested}, - {Title: _titleFailed, Hidden: totalFailed == 0}, - {Title: _titleLastupdated}, + {Title: _titleLastUpdated}, }, Rows: rows, } diff --git a/cli/cmd/lib_batch_apis.go b/cli/cmd/lib_batch_apis.go index 272dbfa0fa..ac43150a53 100644 --- a/cli/cmd/lib_batch_apis.go +++ b/cli/cmd/lib_batch_apis.go @@ -74,7 +74,7 @@ func batchAPIsTable(batchAPIs []schema.APIResponse, envNames []string) table.Tab {Title: _titleBatchAPI}, {Title: _titleJobCount}, {Title: _titleLatestJobID}, - {Title: _titleLastupdated}, + {Title: _titleLastUpdated}, }, Rows: rows, } diff --git a/cli/cmd/lib_realtime_apis.go b/cli/cmd/lib_realtime_apis.go 
index be4316e0a8..48fdab941c 100644 --- a/cli/cmd/lib_realtime_apis.go +++ b/cli/cmd/lib_realtime_apis.go @@ -17,6 +17,7 @@ limitations under the License. package cmd import ( + "fmt" "strings" "time" @@ -56,36 +57,24 @@ func realtimeAPITable(realtimeAPI schema.APIResponse, env cliconfig.Environment) func realtimeAPIsTable(realtimeAPIs []schema.APIResponse, envNames []string) table.Table { rows := make([][]interface{}, 0, len(realtimeAPIs)) - var totalFailed int32 - var totalStale int32 - for i, realtimeAPI := range realtimeAPIs { lastUpdated := time.Unix(realtimeAPI.Spec.LastUpdated, 0) rows = append(rows, []interface{}{ envNames[i], realtimeAPI.Spec.Name, - realtimeAPI.Status.Message(), - realtimeAPI.Status.Updated.Ready, - realtimeAPI.Status.Stale.Ready, - realtimeAPI.Status.Requested, - realtimeAPI.Status.Updated.TotalFailed(), + fmt.Sprintf("%d/%d", realtimeAPI.Status.Ready, realtimeAPI.Status.Requested), + realtimeAPI.Status.UpToDate, libtime.SinceStr(&lastUpdated), }) - - totalFailed += realtimeAPI.Status.Updated.TotalFailed() - totalStale += realtimeAPI.Status.Stale.Ready } return table.Table{ Headers: []table.Header{ {Title: _titleEnvironment}, {Title: _titleRealtimeAPI}, - {Title: _titleStatus}, + {Title: _titleLive}, {Title: _titleUpToDate}, - {Title: _titleStale, Hidden: totalStale == 0}, - {Title: _titleRequested}, - {Title: _titleFailed, Hidden: totalFailed == 0}, - {Title: _titleLastupdated}, + {Title: _titleLastUpdated}, }, Rows: rows, } diff --git a/cli/cmd/lib_task_apis.go b/cli/cmd/lib_task_apis.go index 50575b8516..3bd0275caf 100644 --- a/cli/cmd/lib_task_apis.go +++ b/cli/cmd/lib_task_apis.go @@ -72,7 +72,7 @@ func taskAPIsTable(taskAPIs []schema.APIResponse, envNames []string) table.Table {Title: _titleTaskAPI}, {Title: _titleTaskJobCount}, {Title: _titleLatestTaskJobID}, - {Title: _titleLastupdated}, + {Title: _titleLastUpdated}, }, Rows: rows, } diff --git a/cli/cmd/lib_traffic_splitters.go b/cli/cmd/lib_traffic_splitters.go index 
39c344038a..f89d350ded 100644 --- a/cli/cmd/lib_traffic_splitters.go +++ b/cli/cmd/lib_traffic_splitters.go @@ -17,6 +17,7 @@ limitations under the License. package cmd import ( + "fmt" "strings" "time" @@ -82,8 +83,8 @@ func trafficSplitTable(trafficSplitter schema.APIResponse, env cliconfig.Environ env.Name, apiName, api.Weight, - apiRes.Status.Message(), - apiRes.Status.Requested, + fmt.Sprintf("%d/%d", apiRes.Status.Ready, apiRes.Status.Requested), + apiRes.Status.UpToDate, libtime.SinceStr(&lastUpdated), }) } @@ -93,9 +94,9 @@ func trafficSplitTable(trafficSplitter schema.APIResponse, env cliconfig.Environ {Title: _titleEnvironment}, {Title: _titleAPIs}, {Title: _trafficSplitterWeights}, - {Title: _titleStatus}, - {Title: _titleRequested}, - {Title: _titleLastupdated}, + {Title: _titleLive}, + {Title: _titleUpToDate}, + {Title: _titleLastUpdated}, }, Rows: rows, }, nil @@ -127,7 +128,7 @@ func trafficSplitterListTable(trafficSplitter []schema.APIResponse, envNames []s {Title: _titleEnvironment}, {Title: _titleTrafficSplitter}, {Title: _titleAPIs}, - {Title: _titleLastupdated}, + {Title: _titleLastUpdated}, }, Rows: rows, } diff --git a/pkg/operator/resources/asyncapi/status.go b/pkg/operator/resources/asyncapi/status.go index 38e02329d0..ff7ff67530 100644 --- a/pkg/operator/resources/asyncapi/status.go +++ b/pkg/operator/resources/asyncapi/status.go @@ -42,8 +42,6 @@ type asyncResourceGroup struct { func GetStatus(apiName string) (*status.Status, error) { var apiDeployment *kapps.Deployment var gatewayDeployment *kapps.Deployment - var gatewayPods []kcore.Pod - var apiPods []kcore.Pod err := parallel.RunFirstErr( func() error { @@ -56,26 +54,6 @@ func GetStatus(apiName string) (*status.Status, error) { gatewayDeployment, err = config.K8s.GetDeployment(getGatewayK8sName(apiName)) return err }, - func() error { - var err error - gatewayPods, err = config.K8s.ListPodsByLabels( - map[string]string{ - "apiName": apiName, - "cortex.dev/async": "gateway", - }, - ) - 
return err - }, - func() error { - var err error - apiPods, err = config.K8s.ListPodsByLabels( - map[string]string{ - "apiName": apiName, - "cortex.dev/async": "api", - }, - ) - return err - }, ) if err != nil { return nil, err @@ -89,7 +67,7 @@ func GetStatus(apiName string) (*status.Status, error) { return nil, errors.ErrorUnexpected("unable to find gateway deployment", apiName) } - return apiStatus(apiDeployment, apiPods, gatewayDeployment, gatewayPods) + return apiStatus(apiDeployment), nil } func GetAllStatuses(deployments []kapps.Deployment, pods []kcore.Pod) ([]status.Status, error) { @@ -106,11 +84,7 @@ func GetAllStatuses(deployments []kapps.Deployment, pods []kcore.Pod) ([]status. return nil, errors.ErrorUnexpected("unable to find gateway deployment", apiName) } - st, err := apiStatus(k8sResources.APIDeployment, k8sResources.APIPods, k8sResources.GatewayDeployment, k8sResources.GatewayPods) - if err != nil { - return nil, err - } - statuses[i] = *st + statuses[i] = *apiStatus(k8sResources.APIDeployment) i++ } @@ -174,22 +148,14 @@ func groupResourcesByAPI(deployments []kapps.Deployment, pods []kcore.Pod) map[s return resourcesByAPI } -func apiStatus(apiDeployment *kapps.Deployment, apiPods []kcore.Pod, gatewayDeployment *kapps.Deployment, gatewayPods []kcore.Pod) (*status.Status, error) { - autoscalingSpec, err := userconfig.AutoscalingFromAnnotations(apiDeployment) - if err != nil { - return nil, err +func apiStatus(deployment *kapps.Deployment) *status.Status { + return &status.Status{ + APIName: deployment.Labels["apiName"], + APIID: deployment.Labels["apiID"], + Ready: deployment.Status.ReadyReplicas, + Requested: deployment.Status.Replicas, + UpToDate: deployment.Status.UpdatedReplicas, } - - apiReplicaCounts := getReplicaCounts(apiDeployment, apiPods) - gatewayReplicaCounts := getReplicaCounts(gatewayDeployment, gatewayPods) - - st := &status.Status{} - st.APIName = apiDeployment.Labels["apiName"] - st.APIID = apiDeployment.Labels["apiID"] - 
st.ReplicaCounts = apiReplicaCounts - st.Code = getStatusCode(apiReplicaCounts, gatewayReplicaCounts, autoscalingSpec.MinReplicas) - - return st, nil } func getStatusCode(apiCounts status.ReplicaCounts, gatewayCounts status.ReplicaCounts, apiMinReplicas int32) status.Code { diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 256b253f8e..c9ddaf5957 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -175,8 +175,8 @@ func DeleteAPI(apiName string, keepCache bool) error { return nil } -func GetAllAPIs(pods []kcore.Pod, deployments []kapps.Deployment) ([]schema.APIResponse, error) { - statuses, err := GetAllStatuses(deployments, pods) +func GetAllAPIs(deployments []kapps.Deployment) ([]schema.APIResponse, error) { + statuses, err := GetAllStatuses(deployments) if err != nil { return nil, err } diff --git a/pkg/operator/resources/realtimeapi/status.go b/pkg/operator/resources/realtimeapi/status.go index a65716f35c..4c8eb5ac46 100644 --- a/pkg/operator/resources/realtimeapi/status.go +++ b/pkg/operator/resources/realtimeapi/status.go @@ -24,30 +24,15 @@ import ( "github.com/cortexlabs/cortex/pkg/consts" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/parallel" "github.com/cortexlabs/cortex/pkg/types/status" - "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/cortexlabs/cortex/pkg/workloads" kapps "k8s.io/api/apps/v1" kcore "k8s.io/api/core/v1" ) func GetStatus(apiName string) (*status.Status, error) { - var deployment *kapps.Deployment - var pods []kcore.Pod - - err := parallel.RunFirstErr( - func() error { - var err error - deployment, err = config.K8s.GetDeployment(workloads.K8sName(apiName)) - return err - }, - func() error { - var err error - pods, err = config.K8s.ListPodsByLabel("apiName", apiName) - return err - }, - ) + var err error + deployment, err := 
config.K8s.GetDeployment(workloads.K8sName(apiName)) if err != nil { return nil, err } @@ -56,17 +41,13 @@ func GetStatus(apiName string) (*status.Status, error) { return nil, errors.ErrorUnexpected("unable to find deployment", apiName) } - return apiStatus(deployment, pods) + return apiStatus(*deployment), nil } -func GetAllStatuses(deployments []kapps.Deployment, pods []kcore.Pod) ([]status.Status, error) { +func GetAllStatuses(deployments []kapps.Deployment) ([]status.Status, error) { statuses := make([]status.Status, len(deployments)) for i := range deployments { - st, err := apiStatus(&deployments[i], pods) - if err != nil { - return nil, err - } - statuses[i] = *st + statuses[i] = *apiStatus(deployments[i]) } sort.Slice(statuses, func(i, j int) bool { @@ -76,19 +57,14 @@ func GetAllStatuses(deployments []kapps.Deployment, pods []kcore.Pod) ([]status. return statuses, nil } -func apiStatus(deployment *kapps.Deployment, allPods []kcore.Pod) (*status.Status, error) { - autoscalingSpec, err := userconfig.AutoscalingFromAnnotations(deployment) - if err != nil { - return nil, err +func apiStatus(deployment kapps.Deployment) *status.Status { + return &status.Status{ + APIName: deployment.Labels["apiName"], + APIID: deployment.Labels["apiID"], + Ready: deployment.Status.ReadyReplicas, + Requested: deployment.Status.Replicas, + UpToDate: deployment.Status.UpdatedReplicas, } - - status := &status.Status{} - status.APIName = deployment.Labels["apiName"] - status.APIID = deployment.Labels["apiID"] - status.ReplicaCounts = getReplicaCounts(deployment, allPods) - status.Code = getStatusCode(&status.ReplicaCounts, autoscalingSpec.MinReplicas) - - return status, nil } func getReplicaCounts(deployment *kapps.Deployment, pods []kcore.Pod) status.ReplicaCounts { diff --git a/pkg/operator/resources/resources.go b/pkg/operator/resources/resources.go index 87069c2136..b9853c1993 100644 --- a/pkg/operator/resources/resources.go +++ b/pkg/operator/resources/resources.go @@ -308,14 
+308,11 @@ func GetAPIs() ([]schema.APIResponse, error) { } } - var realtimeAPIPods []kcore.Pod var batchAPIPods []kcore.Pod var taskAPIPods []kcore.Pod var asyncAPIPods []kcore.Pod for _, pod := range pods { switch pod.Labels["apiKind"] { - case userconfig.RealtimeAPIKind.String(): - realtimeAPIPods = append(realtimeAPIPods, pod) case userconfig.BatchAPIKind.String(): batchAPIPods = append(batchAPIPods, pod) case userconfig.TaskAPIKind.String(): @@ -340,7 +337,7 @@ func GetAPIs() ([]schema.APIResponse, error) { } } - realtimeAPIList, err := realtimeapi.GetAllAPIs(realtimeAPIPods, realtimeAPIDeployments) + realtimeAPIList, err := realtimeapi.GetAllAPIs(realtimeAPIDeployments) if err != nil { return nil, err } diff --git a/pkg/types/status/status.go b/pkg/types/status/status.go index 6dad4e1992..9a95c16463 100644 --- a/pkg/types/status/status.go +++ b/pkg/types/status/status.go @@ -17,10 +17,11 @@ limitations under the License. package status type Status struct { - APIName string `json:"api_name"` - APIID string `json:"api_id"` - Code Code `json:"status_code"` - ReplicaCounts `json:"replica_counts"` + APIName string `json:"api_name"` + APIID string `json:"api_id"` + Ready int32 `json:"ready"` + Requested int32 `json:"requested"` + UpToDate int32 `json:"up_to_date"` } type ReplicaCounts struct { @@ -53,10 +54,6 @@ type WorkerCounts struct { Unknown int32 `json:"unknown,omitempty"` } -func (status *Status) Message() string { - return status.Code.Message() -} - func (src *SubReplicaCounts) TotalFailed() int32 { return src.Failed + src.ErrImagePull + src.Killed + src.KilledOOM + src.Stalled } From dd50f2b509df8d5f9f112bb843e61eab04ecc61d Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 15 Jul 2021 21:43:48 +0300 Subject: [PATCH 22/40] WIP on upgrading the statuses --- cli/cmd/get.go | 6 +- cli/cmd/lib_realtime_apis.go | 2 +- go.mod | 2 +- pkg/operator/endpoints/logs.go | 11 +++- pkg/operator/operator/k8s.go | 13 ++++ pkg/operator/resources/asyncapi/api.go 
| 30 ++++----- pkg/operator/resources/asyncapi/status.go | 66 ++++--------------- pkg/operator/resources/job/batchapi/api.go | 4 +- pkg/operator/resources/job/taskapi/api.go | 4 +- pkg/operator/resources/realtimeapi/api.go | 28 ++++---- pkg/operator/resources/realtimeapi/status.go | 14 +--- pkg/operator/resources/resources.go | 9 +-- pkg/operator/resources/trafficsplitter/api.go | 4 +- pkg/operator/schema/schema.go | 2 +- pkg/types/status/status.go | 27 ++++++-- 15 files changed, 103 insertions(+), 119 deletions(-) diff --git a/cli/cmd/get.go b/cli/cmd/get.go index e366b432e4..0a9b6f9be8 100644 --- a/cli/cmd/get.go +++ b/cli/cmd/get.go @@ -219,7 +219,11 @@ func getAPIsInAllEnvironments() (string, error) { if err == nil { for _, api := range apisRes { - switch api.Spec.Kind { + if api.Status == nil { + // TODO remove this once the status is present for all + continue + } + switch api.Status.APIKind { case userconfig.BatchAPIKind: allBatchAPIEnvs = append(allBatchAPIEnvs, env.Name) allBatchAPIs = append(allBatchAPIs, api) diff --git a/cli/cmd/lib_realtime_apis.go b/cli/cmd/lib_realtime_apis.go index 48fdab941c..01be8891c2 100644 --- a/cli/cmd/lib_realtime_apis.go +++ b/cli/cmd/lib_realtime_apis.go @@ -61,7 +61,7 @@ func realtimeAPIsTable(realtimeAPIs []schema.APIResponse, envNames []string) tab lastUpdated := time.Unix(realtimeAPI.Spec.LastUpdated, 0) rows = append(rows, []interface{}{ envNames[i], - realtimeAPI.Spec.Name, + realtimeAPI.Status.APIName, fmt.Sprintf("%d/%d", realtimeAPI.Status.Ready, realtimeAPI.Status.Requested), realtimeAPI.Status.UpToDate, libtime.SinceStr(&lastUpdated), diff --git a/go.mod b/go.mod index 6acb918587..a33a4c4903 100644 --- a/go.mod +++ b/go.mod @@ -67,7 +67,7 @@ require ( golang.org/x/time v0.0.0-20210611083556-38a9dc6acbc6 // indirect gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect google.golang.org/genproto v0.0.0-20210701133433-6b8dcf568a95 // indirect - google.golang.org/grpc v1.39.0 // indirect + google.golang.org/grpc v1.39.0 
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f // indirect gopkg.in/karalabe/cookiejar.v2 v2.0.0-20150724131613-8dcd6a7f4951 gopkg.in/segmentio/analytics-go.v3 v3.1.0
diff --git a/pkg/operator/endpoints/logs.go b/pkg/operator/endpoints/logs.go
index 2d335e27da..d56add3806 100644
--- a/pkg/operator/endpoints/logs.go
+++ b/pkg/operator/endpoints/logs.go
@@ -19,6 +19,7 @@ package endpoints
 import (
 	"net/http"
 
+	"github.com/cortexlabs/cortex/pkg/lib/errors"
 	"github.com/cortexlabs/cortex/pkg/operator/operator"
 	"github.com/cortexlabs/cortex/pkg/operator/resources"
 	"github.com/cortexlabs/cortex/pkg/operator/resources/asyncapi"
@@ -98,7 +99,11 @@ func GetLogURL(w http.ResponseWriter, r *http.Request) {
 			respondError(w, r, err)
 			return
 		}
-		logURL, err := operator.APILogURL(apiResponse[0].Spec)
+		if apiResponse[0].Spec == nil {
+			respondError(w, r, errors.ErrorUnexpected("unable to get api spec", apiName))
+			return // Spec is dereferenced by the next statement
+		}
+		logURL, err := operator.APILogURL(*apiResponse[0].Spec)
 		if err != nil {
 			respondError(w, r, err)
 			return
@@ -112,7 +117,11 @@ func GetLogURL(w http.ResponseWriter, r *http.Request) {
 			respondError(w, r, err)
 			return
 		}
-		logURL, err := operator.APILogURL(apiResponse[0].Spec)
+		if apiResponse[0].Spec == nil {
+			respondError(w, r, errors.ErrorUnexpected("unable to get api spec", apiName))
+			return // Spec is dereferenced by the next statement
+		}
+		logURL, err := operator.APILogURL(*apiResponse[0].Spec)
 		if err != nil {
 			respondError(w, r, err)
 			return
diff --git a/pkg/operator/operator/k8s.go b/pkg/operator/operator/k8s.go index f9536596ce..b85cb81b9e 100644 --- a/pkg/operator/operator/k8s.go +++ b/pkg/operator/operator/k8s.go @@ -63,3 +63,16 @@ func APIEndpoint(api *spec.API) (string, error) { return urls.Join(baseAPIEndpoint, *api.Networking.Endpoint), nil } + +func APIEndpointFromPath(apiNetworkingPath string) (string, error) { + var err error + baseAPIEndpoint := "" + + baseAPIEndpoint, err = APILoadBalancerURL() + if err != nil { + return "", err + } + baseAPIEndpoint = strings.Replace(baseAPIEndpoint, "https://", 
"http://", 1) + + return urls.Join(baseAPIEndpoint, apiNetworkingPath), nil +} diff --git a/pkg/operator/resources/asyncapi/api.go b/pkg/operator/resources/asyncapi/api.go index 39cce27446..4666792ee4 100644 --- a/pkg/operator/resources/asyncapi/api.go +++ b/pkg/operator/resources/asyncapi/api.go @@ -269,7 +269,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { - Spec: *api, + Spec: api, Status: status, Endpoint: apiEndpoint, DashboardURL: dashboardURL, @@ -277,29 +277,27 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp }, nil } -func GetAllAPIs(pods []kcore.Pod, deployments []kapps.Deployment) ([]schema.APIResponse, error) { - statuses, err := GetAllStatuses(deployments, pods) +func GetAllAPIs(deployments []kapps.Deployment) ([]schema.APIResponse, error) { + statuses, err := GetAllStatuses(deployments) if err != nil { return nil, err } - apiNames, apiIDs := namesAndIDsFromStatuses(statuses) - apis, err := operator.DownloadAPISpecs(apiNames, apiIDs) - if err != nil { - return nil, err - } - - asyncAPIs := make([]schema.APIResponse, len(apis)) + asyncAPIs := make([]schema.APIResponse, len(statuses)) - for i := range apis { - api := apis[i] - endpoint, err := operator.APIEndpoint(&api) - if err != nil { - return nil, err + for i := range statuses { + var endpoint string + for _, deployment := range deployments { + if deployment.Labels["apiName"] == statuses[i].APIName { + endpoint, err = operator.APIEndpointFromPath(deployment.Annotations[userconfig.EndpointAnnotationKey]) + if err != nil { + return nil, err + } + break + } } asyncAPIs[i] = schema.APIResponse{ - Spec: api, Status: &statuses[i], Endpoint: endpoint, } diff --git a/pkg/operator/resources/asyncapi/status.go b/pkg/operator/resources/asyncapi/status.go index ff7ff67530..37f29b36da 100644 --- a/pkg/operator/resources/asyncapi/status.go +++ b/pkg/operator/resources/asyncapi/status.go @@ -32,11 +32,9 @@ import 
( kcore "k8s.io/api/core/v1" ) -type asyncResourceGroup struct { +type asyncDeployments struct { APIDeployment *kapps.Deployment - APIPods []kcore.Pod GatewayDeployment *kapps.Deployment - GatewayPods []kcore.Pod } func GetStatus(apiName string) (*status.Status, error) { @@ -67,15 +65,15 @@ func GetStatus(apiName string) (*status.Status, error) { return nil, errors.ErrorUnexpected("unable to find gateway deployment", apiName) } - return apiStatus(apiDeployment), nil + return status.StatusFromDeployment(apiDeployment), nil } -func GetAllStatuses(deployments []kapps.Deployment, pods []kcore.Pod) ([]status.Status, error) { - resourcesByAPI := groupResourcesByAPI(deployments, pods) - statuses := make([]status.Status, len(resourcesByAPI)) +func GetAllStatuses(deployments []kapps.Deployment) ([]status.Status, error) { + deploymentsByAPI := groupDeploymentsByAPI(deployments) + statuses := make([]status.Status, len(deploymentsByAPI)) var i int - for apiName, k8sResources := range resourcesByAPI { + for apiName, k8sResources := range deploymentsByAPI { if k8sResources.APIDeployment == nil { return nil, errors.ErrorUnexpected("unable to find api deployment", apiName) } @@ -84,7 +82,7 @@ func GetAllStatuses(deployments []kapps.Deployment, pods []kcore.Pod) ([]status. return nil, errors.ErrorUnexpected("unable to find gateway deployment", apiName) } - statuses[i] = *apiStatus(k8sResources.APIDeployment) + statuses[i] = *status.StatusFromDeployment(k8sResources.APIDeployment) i++ } @@ -95,26 +93,14 @@ func GetAllStatuses(deployments []kapps.Deployment, pods []kcore.Pod) ([]status. 
return statuses, nil } -func namesAndIDsFromStatuses(statuses []status.Status) ([]string, []string) { - apiNames := make([]string, len(statuses)) - apiIDs := make([]string, len(statuses)) - - for i, st := range statuses { - apiNames[i] = st.APIName - apiIDs[i] = st.APIID - } - - return apiNames, apiIDs -} - // let's do CRDs instead, to avoid this -func groupResourcesByAPI(deployments []kapps.Deployment, pods []kcore.Pod) map[string]*asyncResourceGroup { - resourcesByAPI := map[string]*asyncResourceGroup{} +func groupDeploymentsByAPI(deployments []kapps.Deployment) map[string]*asyncDeployments { + deploymentsByAPI := map[string]*asyncDeployments{} for i := range deployments { deployment := deployments[i] apiName := deployment.Labels["apiName"] asyncType := deployment.Labels["cortex.dev/async"] - apiResources, exists := resourcesByAPI[apiName] + apiResources, exists := deploymentsByAPI[apiName] if exists { if asyncType == "api" { apiResources.APIDeployment = &deployment @@ -123,39 +109,13 @@ func groupResourcesByAPI(deployments []kapps.Deployment, pods []kcore.Pod) map[s } } else { if asyncType == "api" { - resourcesByAPI[apiName] = &asyncResourceGroup{APIDeployment: &deployment} + deploymentsByAPI[apiName] = &asyncDeployments{APIDeployment: &deployment} } else { - resourcesByAPI[apiName] = &asyncResourceGroup{GatewayDeployment: &deployment} + deploymentsByAPI[apiName] = &asyncDeployments{GatewayDeployment: &deployment} } } } - - for _, pod := range pods { - apiName := pod.Labels["apiName"] - asyncType := pod.Labels["cortex.dev/async"] - apiResources, exists := resourcesByAPI[apiName] - if !exists { - // ignore pods that might still be waiting to be deleted while the deployment has already been deleted - continue - } - - if asyncType == "api" { - apiResources.APIPods = append(resourcesByAPI[apiName].APIPods, pod) - } else { - apiResources.GatewayPods = append(resourcesByAPI[apiName].GatewayPods, pod) - } - } - return resourcesByAPI -} - -func apiStatus(deployment 
*kapps.Deployment) *status.Status { - return &status.Status{ - APIName: deployment.Labels["apiName"], - APIID: deployment.Labels["apiID"], - Ready: deployment.Status.ReadyReplicas, - Requested: deployment.Status.Replicas, - UpToDate: deployment.Status.UpdatedReplicas, - } + return deploymentsByAPI } func getStatusCode(apiCounts status.ReplicaCounts, gatewayCounts status.ReplicaCounts, apiMinReplicas int32) status.Code { diff --git a/pkg/operator/resources/job/batchapi/api.go b/pkg/operator/resources/job/batchapi/api.go index b85726a531..0570ae8124 100644 --- a/pkg/operator/resources/job/batchapi/api.go +++ b/pkg/operator/resources/job/batchapi/api.go @@ -184,7 +184,7 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, batchJob } batchAPIsMap[apiName] = &schema.APIResponse{ - Spec: *api, + Spec: api, Endpoint: endpoint, BatchJobStatuses: jobStatuses, } @@ -263,7 +263,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { - Spec: *api, + Spec: api, BatchJobStatuses: jobStatuses, Endpoint: endpoint, DashboardURL: dashboardURL, diff --git a/pkg/operator/resources/job/taskapi/api.go b/pkg/operator/resources/job/taskapi/api.go index 9261cc16a9..535259c1b3 100644 --- a/pkg/operator/resources/job/taskapi/api.go +++ b/pkg/operator/resources/job/taskapi/api.go @@ -173,7 +173,7 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs } taskAPIsMap[apiName] = &schema.APIResponse{ - Spec: *api, + Spec: api, Endpoint: endpoint, TaskJobStatuses: jobStatuses, } @@ -295,7 +295,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { - Spec: *api, + Spec: api, TaskJobStatuses: jobStatuses, Endpoint: endpoint, DashboardURL: dashboardURL, diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index c9ddaf5957..9f675063fd 100644 --- 
a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -181,23 +181,21 @@ func GetAllAPIs(deployments []kapps.Deployment) ([]schema.APIResponse, error) { return nil, err } - apiNames, apiIDs := namesAndIDsFromStatuses(statuses) - apis, err := operator.DownloadAPISpecs(apiNames, apiIDs) - if err != nil { - return nil, err - } - - realtimeAPIs := make([]schema.APIResponse, len(apis)) - - for i := range apis { - api := apis[i] - endpoint, err := operator.APIEndpoint(&api) - if err != nil { - return nil, err + realtimeAPIs := make([]schema.APIResponse, len(statuses)) + + for i := range statuses { + var endpoint string + for _, deployment := range deployments { + if deployment.Labels["apiName"] == statuses[i].APIName { + endpoint, err = operator.APIEndpointFromPath(deployment.Annotations[userconfig.EndpointAnnotationKey]) + if err != nil { + return nil, err + } + break + } } realtimeAPIs[i] = schema.APIResponse{ - Spec: api, Status: &statuses[i], Endpoint: endpoint, } @@ -238,7 +236,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { - Spec: *api, + Spec: api, Status: st, Endpoint: apiEndpoint, DashboardURL: dashboardURL, diff --git a/pkg/operator/resources/realtimeapi/status.go b/pkg/operator/resources/realtimeapi/status.go index 4c8eb5ac46..9b815c66bd 100644 --- a/pkg/operator/resources/realtimeapi/status.go +++ b/pkg/operator/resources/realtimeapi/status.go @@ -41,13 +41,13 @@ func GetStatus(apiName string) (*status.Status, error) { return nil, errors.ErrorUnexpected("unable to find deployment", apiName) } - return apiStatus(*deployment), nil + return status.StatusFromDeployment(deployment), nil } func GetAllStatuses(deployments []kapps.Deployment) ([]status.Status, error) { statuses := make([]status.Status, len(deployments)) for i := range deployments { - statuses[i] = *apiStatus(deployments[i]) + statuses[i] = *status.StatusFromDeployment(&deployments[i]) } 
sort.Slice(statuses, func(i, j int) bool { @@ -57,16 +57,6 @@ func GetAllStatuses(deployments []kapps.Deployment) ([]status.Status, error) { return statuses, nil } -func apiStatus(deployment kapps.Deployment) *status.Status { - return &status.Status{ - APIName: deployment.Labels["apiName"], - APIID: deployment.Labels["apiID"], - Ready: deployment.Status.ReadyReplicas, - Requested: deployment.Status.Replicas, - UpToDate: deployment.Status.UpdatedReplicas, - } -} - func getReplicaCounts(deployment *kapps.Deployment, pods []kcore.Pod) status.ReplicaCounts { counts := status.ReplicaCounts{} counts.Requested = *deployment.Spec.Replicas diff --git a/pkg/operator/resources/resources.go b/pkg/operator/resources/resources.go index b9853c1993..768122d69d 100644 --- a/pkg/operator/resources/resources.go +++ b/pkg/operator/resources/resources.go @@ -158,7 +158,7 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*schema.APIResponse, stri apiEndpoint, _ := operator.APIEndpoint(api) return &schema.APIResponse{ - Spec: *api, + Spec: api, Endpoint: apiEndpoint, }, msg, nil } @@ -310,15 +310,12 @@ func GetAPIs() ([]schema.APIResponse, error) { var batchAPIPods []kcore.Pod var taskAPIPods []kcore.Pod - var asyncAPIPods []kcore.Pod for _, pod := range pods { switch pod.Labels["apiKind"] { case userconfig.BatchAPIKind.String(): batchAPIPods = append(batchAPIPods, pod) case userconfig.TaskAPIKind.String(): taskAPIPods = append(taskAPIPods, pod) - case userconfig.AsyncAPIKind.String(): - asyncAPIPods = append(asyncAPIPods, pod) } } @@ -353,7 +350,7 @@ func GetAPIs() ([]schema.APIResponse, error) { return nil, err } - asyncAPIList, err := asyncapi.GetAllAPIs(asyncAPIPods, asyncAPIDeployments) + asyncAPIList, err := asyncapi.GetAllAPIs(asyncAPIDeployments) if err != nil { return nil, err } @@ -446,7 +443,7 @@ func GetAPIByID(apiName string, apiID string) ([]schema.APIResponse, error) { return []schema.APIResponse{ { - Spec: *apiSpec, + Spec: apiSpec, }, }, nil } diff --git 
a/pkg/operator/resources/trafficsplitter/api.go b/pkg/operator/resources/trafficsplitter/api.go index 9d81a17faa..a1fba2b504 100644 --- a/pkg/operator/resources/trafficsplitter/api.go +++ b/pkg/operator/resources/trafficsplitter/api.go @@ -158,7 +158,7 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService) ([]schem } trafficSplitters = append(trafficSplitters, schema.APIResponse{ - Spec: trafficSplitter, + Spec: &trafficSplitter, Endpoint: endpoint, }) } @@ -180,7 +180,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { - Spec: *api, + Spec: api, Endpoint: endpoint, }, }, nil diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index eff68701ee..401b59a30c 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -56,7 +56,7 @@ type DeployResult struct { } type APIResponse struct { - Spec spec.API `json:"spec"` + Spec *spec.API `json:"spec,omitempty"` Status *status.Status `json:"status,omitempty"` Endpoint string `json:"endpoint"` DashboardURL *string `json:"dashboard_url,omitempty"` diff --git a/pkg/types/status/status.go b/pkg/types/status/status.go index 9a95c16463..b2299bce5c 100644 --- a/pkg/types/status/status.go +++ b/pkg/types/status/status.go @@ -16,12 +16,18 @@ limitations under the License. 
package status +import ( + "github.com/cortexlabs/cortex/pkg/types/userconfig" + kapps "k8s.io/api/apps/v1" +) + type Status struct { - APIName string `json:"api_name"` - APIID string `json:"api_id"` - Ready int32 `json:"ready"` - Requested int32 `json:"requested"` - UpToDate int32 `json:"up_to_date"` + APIName string `json:"api_name"` + APIKind userconfig.Kind `json:"api_kind"` + APIID string `json:"api_id"` + Ready int32 `json:"ready"` + Requested int32 `json:"requested"` + UpToDate int32 `json:"up_to_date"` } type ReplicaCounts struct { @@ -54,6 +60,17 @@ type WorkerCounts struct { Unknown int32 `json:"unknown,omitempty"` } +func StatusFromDeployment(deployment *kapps.Deployment) *Status { + return &Status{ + APIName: deployment.Labels["apiName"], + APIKind: userconfig.KindFromString(deployment.Labels["apiKind"]), + APIID: deployment.Labels["apiID"], + Ready: deployment.Status.ReadyReplicas, + Requested: deployment.Status.Replicas, + UpToDate: deployment.Status.UpdatedReplicas, + } +} + func (src *SubReplicaCounts) TotalFailed() int32 { return src.Failed + src.ErrImagePull + src.Killed + src.KilledOOM + src.Stalled } From 4f9dc7c47020bf3a92fe0f6e1327d1d66f73ebe8 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 21 Jul 2021 14:59:58 +0300 Subject: [PATCH 23/40] WIP on API statuses --- cli/cmd/get.go | 4 +- cli/cmd/lib_async_apis.go | 4 +- cli/cmd/lib_batch_apis.go | 4 +- cli/cmd/lib_realtime_apis.go | 16 ++- cli/cmd/lib_task_apis.go | 4 +- cli/cmd/lib_traffic_splitters.go | 5 +- pkg/operator/resources/asyncapi/api.go | 104 ++++++++++++------ pkg/operator/resources/asyncapi/status.go | 60 ---------- pkg/operator/resources/job/batchapi/api.go | 4 +- pkg/operator/resources/job/taskapi/api.go | 4 +- pkg/operator/resources/realtimeapi/api.go | 70 ++++++------ pkg/operator/resources/realtimeapi/status.go | 31 ------ pkg/operator/resources/resources.go | 27 ++--- pkg/operator/resources/trafficsplitter/api.go | 4 +- pkg/operator/schema/schema.go | 3 +- 
pkg/types/spec/api.go | 24 ++++ pkg/types/status/status.go | 13 +-- 17 files changed, 177 insertions(+), 204 deletions(-) diff --git a/cli/cmd/get.go b/cli/cmd/get.go index 0a9b6f9be8..f2356fb69a 100644 --- a/cli/cmd/get.go +++ b/cli/cmd/get.go @@ -219,11 +219,11 @@ func getAPIsInAllEnvironments() (string, error) { if err == nil { for _, api := range apisRes { - if api.Status == nil { + if api.Metadata == nil { // TODO remove this once the status is present for all continue } - switch api.Status.APIKind { + switch api.Metadata.Kind { case userconfig.BatchAPIKind: allBatchAPIEnvs = append(allBatchAPIEnvs, env.Name) allBatchAPIs = append(allBatchAPIs, api) diff --git a/cli/cmd/lib_async_apis.go b/cli/cmd/lib_async_apis.go index 3b0347e51c..d810acad62 100644 --- a/cli/cmd/lib_async_apis.go +++ b/cli/cmd/lib_async_apis.go @@ -41,7 +41,9 @@ func asyncAPITable(asyncAPI schema.APIResponse, env cliconfig.Environment) (stri out += "\n" + console.Bold("metrics dashboard: ") + *asyncAPI.DashboardURL + "\n" } - out += "\n" + console.Bold("endpoint: ") + asyncAPI.Endpoint + "\n" + if asyncAPI.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *asyncAPI.Endpoint + "\n" + } out += "\n" + apiHistoryTable(asyncAPI.APIVersions) diff --git a/cli/cmd/lib_batch_apis.go b/cli/cmd/lib_batch_apis.go index ac43150a53..1499ac8869 100644 --- a/cli/cmd/lib_batch_apis.go +++ b/cli/cmd/lib_batch_apis.go @@ -123,7 +123,9 @@ func batchAPITable(batchAPI schema.APIResponse) string { out += "\n" + console.Bold("metrics dashboard: ") + *batchAPI.DashboardURL + "\n" } - out += "\n" + console.Bold("endpoint: ") + batchAPI.Endpoint + "\n" + if batchAPI.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *batchAPI.Endpoint + "\n" + } out += "\n" + apiHistoryTable(batchAPI.APIVersions) diff --git a/cli/cmd/lib_realtime_apis.go b/cli/cmd/lib_realtime_apis.go index 01be8891c2..36abdaff87 100644 --- a/cli/cmd/lib_realtime_apis.go +++ b/cli/cmd/lib_realtime_apis.go @@ -32,8 +32,9 @@ 
func realtimeAPITable(realtimeAPI schema.APIResponse, env cliconfig.Environment) var out string t := realtimeAPIsTable([]schema.APIResponse{realtimeAPI}, []string{env.Name}) - t.FindHeaderByTitle(_titleEnvironment).Hidden = true - t.FindHeaderByTitle(_titleRealtimeAPI).Hidden = true + // TODO decide on whether we want to keep this consistent with `cortex get` command + // t.FindHeaderByTitle(_titleEnvironment).Hidden = true + // t.FindHeaderByTitle(_titleRealtimeAPI).Hidden = true out += t.MustFormat() @@ -41,7 +42,9 @@ func realtimeAPITable(realtimeAPI schema.APIResponse, env cliconfig.Environment) out += "\n" + console.Bold("metrics dashboard: ") + *realtimeAPI.DashboardURL + "\n" } - out += "\n" + console.Bold("endpoint: ") + realtimeAPI.Endpoint + "\n" + if realtimeAPI.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *realtimeAPI.Endpoint + "\n" + } out += "\n" + apiHistoryTable(realtimeAPI.APIVersions) @@ -58,10 +61,13 @@ func realtimeAPIsTable(realtimeAPIs []schema.APIResponse, envNames []string) tab rows := make([][]interface{}, 0, len(realtimeAPIs)) for i, realtimeAPI := range realtimeAPIs { - lastUpdated := time.Unix(realtimeAPI.Spec.LastUpdated, 0) + if realtimeAPI.Metadata == nil || realtimeAPI.Status == nil { + continue + } + lastUpdated := time.Unix(realtimeAPI.Metadata.LastUpdated, 0) rows = append(rows, []interface{}{ envNames[i], - realtimeAPI.Status.APIName, + realtimeAPI.Metadata.Name, fmt.Sprintf("%d/%d", realtimeAPI.Status.Ready, realtimeAPI.Status.Requested), realtimeAPI.Status.UpToDate, libtime.SinceStr(&lastUpdated), diff --git a/cli/cmd/lib_task_apis.go b/cli/cmd/lib_task_apis.go index 3bd0275caf..a639cee5f0 100644 --- a/cli/cmd/lib_task_apis.go +++ b/cli/cmd/lib_task_apis.go @@ -118,7 +118,9 @@ func taskAPITable(taskAPI schema.APIResponse) string { out += "\n" + console.Bold("metrics dashboard: ") + *taskAPI.DashboardURL + "\n" } - out += "\n" + console.Bold("endpoint: ") + taskAPI.Endpoint + "\n" + if taskAPI.Endpoint != nil 
{ + out += "\n" + console.Bold("endpoint: ") + *taskAPI.Endpoint + "\n" + } out += "\n" + apiHistoryTable(taskAPI.APIVersions) diff --git a/cli/cmd/lib_traffic_splitters.go b/cli/cmd/lib_traffic_splitters.go index f89d350ded..cc23e00a87 100644 --- a/cli/cmd/lib_traffic_splitters.go +++ b/cli/cmd/lib_traffic_splitters.go @@ -50,7 +50,10 @@ func trafficSplitterTable(trafficSplitter schema.APIResponse, env cliconfig.Envi out += t.MustFormat() out += "\n" + console.Bold("last updated: ") + libtime.SinceStr(&lastUpdated) - out += "\n" + console.Bold("endpoint: ") + trafficSplitter.Endpoint + "\n" + + if trafficSplitter.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *trafficSplitter.Endpoint + "\n" + } out += "\n" + apiHistoryTable(trafficSplitter.APIVersions) diff --git a/pkg/operator/resources/asyncapi/api.go b/pkg/operator/resources/asyncapi/api.go index 4666792ee4..4c93657a3c 100644 --- a/pkg/operator/resources/asyncapi/api.go +++ b/pkg/operator/resources/asyncapi/api.go @@ -19,6 +19,7 @@ package asyncapi import ( "fmt" "path/filepath" + "sort" "time" "github.com/cortexlabs/cortex/pkg/config" @@ -31,6 +32,7 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/operator" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/cortexlabs/cortex/pkg/workloads" istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" @@ -249,13 +251,77 @@ func DeleteAPI(apiName string, keepCache bool) error { return nil }
+func GetAllAPIs(deployments []kapps.Deployment, virtualServices []istioclientnetworking.VirtualService) ([]schema.APIResponse, error) {
+	asyncAPIs := make([]schema.APIResponse, 0, len(deployments)) // length 0, capacity n: entries are appended below
+	mappedAsyncAPIs := make(map[string]schema.APIResponse, len(deployments))
+	keys := make([]string, 0, len(deployments)) // length 0, capacity n: names are appended below
+
+	for i := range deployments {
+		apiName := deployments[i].Labels["apiName"]
+		keys = append(keys, apiName)
+
+		metadata, err := spec.MetadataFromDeployment(&deployments[i])
+		if err != nil {
+			return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName))
+		}
+		mappedAsyncAPIs[apiName] = schema.APIResponse{
+			Status:   status.StatusFromDeployment(&deployments[i]),
+			Metadata: metadata,
+		}
+	}
+
+	sort.Strings(keys) // emit responses in API-name order
+	for _, apiName := range keys {
+		asyncAPIs = append(asyncAPIs, mappedAsyncAPIs[apiName])
+	}
+
+	return asyncAPIs, nil
+}
+
 func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { - status, err := GetStatus(deployedResource.Name) + var apiDeployment *kapps.Deployment + var gatewayDeployment *kapps.Deployment + + err := parallel.RunFirstErr( + func() error { + var err error + apiDeployment, err = config.K8s.GetDeployment(workloads.K8sName(deployedResource.Name)) + return err + }, + func() error { + var err error + gatewayDeployment, err = config.K8s.GetDeployment(getGatewayK8sName(deployedResource.Name)) + return err + }, + ) if err != nil { return nil, err } - api, err := operator.DownloadAPISpec(status.APIName, status.APIID) + if apiDeployment == nil { + return nil, errors.ErrorUnexpected("unable to find api deployment", deployedResource.Name) + } + + if gatewayDeployment == nil { + return nil, errors.ErrorUnexpected("unable to find gateway deployment", deployedResource.Name) + } + + deployment, err := config.K8s.GetDeployment(workloads.K8sName(deployedResource.Name)) + if err != nil { + return nil, err + } + + if deployment == nil { + return nil, errors.ErrorUnexpected("unable to find deployment", deployedResource.Name) + } + + apiStatus := status.StatusFromDeployment(deployment) + apiMetadata, err := spec.MetadataFromDeployment(deployment) + if err != nil { + return nil, errors.ErrorUnexpected("unable to obtain metadata", deployedResource.Name) + } + + api, err := operator.DownloadAPISpec(apiMetadata.Name, apiMetadata.APIID) if err != nil { return nil, err 
} @@ -270,42 +336,14 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { Spec: api, - Status: status, - Endpoint: apiEndpoint, + Metadata: apiMetadata, + Status: apiStatus, + Endpoint: &apiEndpoint, DashboardURL: dashboardURL, }, }, nil } -func GetAllAPIs(deployments []kapps.Deployment) ([]schema.APIResponse, error) { - statuses, err := GetAllStatuses(deployments) - if err != nil { - return nil, err - } - - asyncAPIs := make([]schema.APIResponse, len(statuses)) - - for i := range statuses { - var endpoint string - for _, deployment := range deployments { - if deployment.Labels["apiName"] == statuses[i].APIName { - endpoint, err = operator.APIEndpointFromPath(deployment.Annotations[userconfig.EndpointAnnotationKey]) - if err != nil { - return nil, err - } - break - } - } - - asyncAPIs[i] = schema.APIResponse{ - Status: &statuses[i], - Endpoint: endpoint, - } - } - - return asyncAPIs, nil -} - func UpdateAPIMetricsCron(apiDeployment *kapps.Deployment) error { apiName := apiDeployment.Labels["apiName"] diff --git a/pkg/operator/resources/asyncapi/status.go b/pkg/operator/resources/asyncapi/status.go index 37f29b36da..48189a89cb 100644 --- a/pkg/operator/resources/asyncapi/status.go +++ b/pkg/operator/resources/asyncapi/status.go @@ -17,17 +17,13 @@ limitations under the License. 
package asyncapi import ( - "sort" "time" "github.com/cortexlabs/cortex/pkg/config" "github.com/cortexlabs/cortex/pkg/consts" - "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" - "github.com/cortexlabs/cortex/pkg/lib/parallel" "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" - "github.com/cortexlabs/cortex/pkg/workloads" kapps "k8s.io/api/apps/v1" kcore "k8s.io/api/core/v1" ) @@ -37,62 +33,6 @@ type asyncDeployments struct { GatewayDeployment *kapps.Deployment } -func GetStatus(apiName string) (*status.Status, error) { - var apiDeployment *kapps.Deployment - var gatewayDeployment *kapps.Deployment - - err := parallel.RunFirstErr( - func() error { - var err error - apiDeployment, err = config.K8s.GetDeployment(workloads.K8sName(apiName)) - return err - }, - func() error { - var err error - gatewayDeployment, err = config.K8s.GetDeployment(getGatewayK8sName(apiName)) - return err - }, - ) - if err != nil { - return nil, err - } - - if apiDeployment == nil { - return nil, errors.ErrorUnexpected("unable to find api deployment", apiName) - } - - if gatewayDeployment == nil { - return nil, errors.ErrorUnexpected("unable to find gateway deployment", apiName) - } - - return status.StatusFromDeployment(apiDeployment), nil -} - -func GetAllStatuses(deployments []kapps.Deployment) ([]status.Status, error) { - deploymentsByAPI := groupDeploymentsByAPI(deployments) - statuses := make([]status.Status, len(deploymentsByAPI)) - - var i int - for apiName, k8sResources := range deploymentsByAPI { - if k8sResources.APIDeployment == nil { - return nil, errors.ErrorUnexpected("unable to find api deployment", apiName) - } - - if k8sResources.GatewayDeployment == nil { - return nil, errors.ErrorUnexpected("unable to find gateway deployment", apiName) - } - - statuses[i] = *status.StatusFromDeployment(k8sResources.APIDeployment) - i++ - } - - sort.Slice(statuses, func(i, j int) bool { - return 
statuses[i].APIName < statuses[j].APIName - }) - - return statuses, nil -} - // let's do CRDs instead, to avoid this func groupDeploymentsByAPI(deployments []kapps.Deployment) map[string]*asyncDeployments { deploymentsByAPI := map[string]*asyncDeployments{} diff --git a/pkg/operator/resources/job/batchapi/api.go b/pkg/operator/resources/job/batchapi/api.go index 0570ae8124..6b230847c6 100644 --- a/pkg/operator/resources/job/batchapi/api.go +++ b/pkg/operator/resources/job/batchapi/api.go @@ -185,7 +185,7 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, batchJob batchAPIsMap[apiName] = &schema.APIResponse{ Spec: api, - Endpoint: endpoint, + Endpoint: &endpoint, BatchJobStatuses: jobStatuses, } } @@ -265,7 +265,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp { Spec: api, BatchJobStatuses: jobStatuses, - Endpoint: endpoint, + Endpoint: &endpoint, DashboardURL: dashboardURL, }, }, nil diff --git a/pkg/operator/resources/job/taskapi/api.go b/pkg/operator/resources/job/taskapi/api.go index 535259c1b3..32aa249636 100644 --- a/pkg/operator/resources/job/taskapi/api.go +++ b/pkg/operator/resources/job/taskapi/api.go @@ -174,7 +174,7 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs taskAPIsMap[apiName] = &schema.APIResponse{ Spec: api, - Endpoint: endpoint, + Endpoint: &endpoint, TaskJobStatuses: jobStatuses, } } @@ -297,7 +297,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp { Spec: api, TaskJobStatuses: jobStatuses, - Endpoint: endpoint, + Endpoint: &endpoint, DashboardURL: dashboardURL, }, }, nil diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 9f675063fd..14d24b3e04 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -19,6 +19,7 @@ package realtimeapi import ( "fmt" "path/filepath" + "sort" "time" 
"github.com/cortexlabs/cortex/pkg/config" @@ -175,54 +176,50 @@ func DeleteAPI(apiName string, keepCache bool) error { return nil } -func GetAllAPIs(deployments []kapps.Deployment) ([]schema.APIResponse, error) { - statuses, err := GetAllStatuses(deployments) - if err != nil { - return nil, err - } +func GetAllAPIs(deployments []kapps.Deployment, virtualServices []istioclientnetworking.VirtualService) ([]schema.APIResponse, error) { + realtimeAPIs := make([]schema.APIResponse, len(deployments)) + mappedRealtimeAPIs := make(map[string]schema.APIResponse, len(deployments)) + keys := make([]string, len(deployments)) - realtimeAPIs := make([]schema.APIResponse, len(statuses)) - - for i := range statuses { - var endpoint string - for _, deployment := range deployments { - if deployment.Labels["apiName"] == statuses[i].APIName { - endpoint, err = operator.APIEndpointFromPath(deployment.Annotations[userconfig.EndpointAnnotationKey]) - if err != nil { - return nil, err - } - break - } - } + for i := range deployments { + apiName := deployments[i].Labels["apiName"] + keys = append(keys, apiName) - realtimeAPIs[i] = schema.APIResponse{ - Status: &statuses[i], - Endpoint: endpoint, + metadata, err := spec.MetadataFromDeployment(&deployments[i]) + if err != nil { + return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) + } + mappedRealtimeAPIs[apiName] = schema.APIResponse{ + Status: status.StatusFromDeployment(&deployments[i]), + Metadata: metadata, } } - return realtimeAPIs, nil -} - -func namesAndIDsFromStatuses(statuses []status.Status) ([]string, []string) { - apiNames := make([]string, len(statuses)) - apiIDs := make([]string, len(statuses)) - - for i, st := range statuses { - apiNames[i] = st.APIName - apiIDs[i] = st.APIID + sort.Strings(keys) + for _, apiName := range keys { + realtimeAPIs = append(realtimeAPIs, mappedRealtimeAPIs[apiName]) } - return apiNames, apiIDs + return realtimeAPIs, nil } func GetAPIByName(deployedResource *operator.DeployedResource) 
([]schema.APIResponse, error) { - st, err := GetStatus(deployedResource.Name) + deployment, err := config.K8s.GetDeployment(workloads.K8sName(deployedResource.Name)) if err != nil { return nil, err } - api, err := operator.DownloadAPISpec(st.APIName, st.APIID) + if deployment == nil { + return nil, errors.ErrorUnexpected("unable to find deployment", deployedResource.Name) + } + + apiStatus := status.StatusFromDeployment(deployment) + apiMetadata, err := spec.MetadataFromDeployment(deployment) + if err != nil { + return nil, errors.ErrorUnexpected("unable to obtain metadata", deployedResource.Name) + } + + api, err := operator.DownloadAPISpec(apiMetadata.Name, apiMetadata.APIID) if err != nil { return nil, err } @@ -237,8 +234,9 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { Spec: api, - Status: st, - Endpoint: apiEndpoint, + Metadata: apiMetadata, + Status: apiStatus, + Endpoint: &apiEndpoint, DashboardURL: dashboardURL, }, }, nil diff --git a/pkg/operator/resources/realtimeapi/status.go b/pkg/operator/resources/realtimeapi/status.go index 9b815c66bd..9952ccd9ca 100644 --- a/pkg/operator/resources/realtimeapi/status.go +++ b/pkg/operator/resources/realtimeapi/status.go @@ -17,46 +17,15 @@ limitations under the License. 
package realtimeapi import ( - "sort" "time" - "github.com/cortexlabs/cortex/pkg/config" "github.com/cortexlabs/cortex/pkg/consts" - "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/types/status" - "github.com/cortexlabs/cortex/pkg/workloads" kapps "k8s.io/api/apps/v1" kcore "k8s.io/api/core/v1" ) -func GetStatus(apiName string) (*status.Status, error) { - var err error - deployment, err := config.K8s.GetDeployment(workloads.K8sName(apiName)) - if err != nil { - return nil, err - } - - if deployment == nil { - return nil, errors.ErrorUnexpected("unable to find deployment", apiName) - } - - return status.StatusFromDeployment(deployment), nil -} - -func GetAllStatuses(deployments []kapps.Deployment) ([]status.Status, error) { - statuses := make([]status.Status, len(deployments)) - for i := range deployments { - statuses[i] = *status.StatusFromDeployment(&deployments[i]) - } - - sort.Slice(statuses, func(i, j int) bool { - return statuses[i].APIName < statuses[j].APIName - }) - - return statuses, nil -} - func getReplicaCounts(deployment *kapps.Deployment, pods []kcore.Pod) status.ReplicaCounts { counts := status.ReplicaCounts{} counts.Requested = *deployment.Spec.Replicas diff --git a/pkg/operator/resources/resources.go b/pkg/operator/resources/resources.go index 768122d69d..690616e923 100644 --- a/pkg/operator/resources/resources.go +++ b/pkg/operator/resources/resources.go @@ -159,7 +159,7 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*schema.APIResponse, stri return &schema.APIResponse{ Spec: api, - Endpoint: apiEndpoint, + Endpoint: &apiEndpoint, }, msg, nil } @@ -256,7 +256,7 @@ func DeleteAPI(apiName string, keepCache bool) (*schema.DeleteResponse, error) { func GetAPIs() ([]schema.APIResponse, error) { var deployments []kapps.Deployment var k8sTaskJobs []kbatch.Job - var pods []kcore.Pod + var taskAPIPods []kcore.Pod var virtualServices []istioclientnetworking.VirtualService 
var batchJobList batch.BatchJobList @@ -268,7 +268,7 @@ func GetAPIs() ([]schema.APIResponse, error) { }, func() error { var err error - pods, err = config.K8s.ListPodsWithLabelKeys("apiName") + taskAPIPods, err = config.K8s.ListPodsByLabel("apiKind", userconfig.TaskAPIKind.String()) return err }, func() error { @@ -308,23 +308,18 @@ func GetAPIs() ([]schema.APIResponse, error) { } } - var batchAPIPods []kcore.Pod - var taskAPIPods []kcore.Pod - for _, pod := range pods { - switch pod.Labels["apiKind"] { - case userconfig.BatchAPIKind.String(): - batchAPIPods = append(batchAPIPods, pod) - case userconfig.TaskAPIKind.String(): - taskAPIPods = append(taskAPIPods, pod) - } - } - + var realtimeAPIVirtualServices []istioclientnetworking.VirtualService + var asyncAPIVirtualServices []istioclientnetworking.VirtualService var batchAPIVirtualServices []istioclientnetworking.VirtualService var taskAPIVirtualServices []istioclientnetworking.VirtualService var trafficSplitterVirtualServices []istioclientnetworking.VirtualService for _, vs := range virtualServices { switch vs.Labels["apiKind"] { + case userconfig.RealtimeAPIKind.String(): + realtimeAPIVirtualServices = append(realtimeAPIVirtualServices, vs) + case userconfig.AsyncAPIKind.String(): + asyncAPIVirtualServices = append(asyncAPIVirtualServices, vs) case userconfig.BatchAPIKind.String(): batchAPIVirtualServices = append(batchAPIVirtualServices, vs) case userconfig.TrafficSplitterKind.String(): @@ -334,7 +329,7 @@ func GetAPIs() ([]schema.APIResponse, error) { } } - realtimeAPIList, err := realtimeapi.GetAllAPIs(realtimeAPIDeployments) + realtimeAPIList, err := realtimeapi.GetAllAPIs(realtimeAPIDeployments, realtimeAPIVirtualServices) if err != nil { return nil, err } @@ -350,7 +345,7 @@ func GetAPIs() ([]schema.APIResponse, error) { return nil, err } - asyncAPIList, err := asyncapi.GetAllAPIs(asyncAPIDeployments) + asyncAPIList, err := asyncapi.GetAllAPIs(asyncAPIDeployments, asyncAPIVirtualServices) if err != nil { 
return nil, err } diff --git a/pkg/operator/resources/trafficsplitter/api.go b/pkg/operator/resources/trafficsplitter/api.go index a1fba2b504..3492544d82 100644 --- a/pkg/operator/resources/trafficsplitter/api.go +++ b/pkg/operator/resources/trafficsplitter/api.go @@ -159,7 +159,7 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService) ([]schem trafficSplitters = append(trafficSplitters, schema.APIResponse{ Spec: &trafficSplitter, - Endpoint: endpoint, + Endpoint: &endpoint, }) } @@ -181,7 +181,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { Spec: api, - Endpoint: endpoint, + Endpoint: &endpoint, }, }, nil } diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index 401b59a30c..522a927a2f 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -57,8 +57,9 @@ type DeployResult struct { type APIResponse struct { Spec *spec.API `json:"spec,omitempty"` + Metadata *spec.Metadata `json:"metadata,omitempty"` Status *status.Status `json:"status,omitempty"` - Endpoint string `json:"endpoint"` + Endpoint *string `json:"endpoint,omitempty"` DashboardURL *string `json:"dashboard_url,omitempty"` BatchJobStatuses []status.BatchJobStatus `json:"batch_job_statuses,omitempty"` TaskJobStatuses []status.TaskJobStatus `json:"task_job_statuses,omitempty"` diff --git a/pkg/types/spec/api.go b/pkg/types/spec/api.go index e181a0ffab..929f3fd51b 100644 --- a/pkg/types/spec/api.go +++ b/pkg/types/spec/api.go @@ -30,6 +30,7 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/hash" s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/types/userconfig" + kapps "k8s.io/api/apps/v1" ) type API struct { @@ -46,6 +47,29 @@ type API struct { MetadataRoot string `json:"metadata_root"` } +type Metadata struct { + *userconfig.Resource + APIID string `json:"id"` + DeploymentID string `json:"deployment_id"` + LastUpdated int64 
`json:"last_updated"` +} + +func MetadataFromDeployment(deployment *kapps.Deployment) (*Metadata, error) { + lastUpdated, err := TimeFromAPIID(deployment.Labels["apiID"]) + if err != nil { + return nil, err + } + return &Metadata{ + Resource: &userconfig.Resource{ + Name: deployment.Labels["apiName"], + Kind: userconfig.KindFromString(deployment.Labels["apiKind"]), + }, + APIID: deployment.Labels["apiID"], + DeploymentID: deployment.Labels["deploymentID"], + LastUpdated: lastUpdated.Unix(), + }, nil +} + /* * ID (uniquely identifies an api configuration for a given deployment) * DeploymentID (used for refreshing a deployment) diff --git a/pkg/types/status/status.go b/pkg/types/status/status.go index b2299bce5c..8a6a4dd160 100644 --- a/pkg/types/status/status.go +++ b/pkg/types/status/status.go @@ -17,17 +17,13 @@ limitations under the License. package status import ( - "github.com/cortexlabs/cortex/pkg/types/userconfig" kapps "k8s.io/api/apps/v1" ) type Status struct { - APIName string `json:"api_name"` - APIKind userconfig.Kind `json:"api_kind"` - APIID string `json:"api_id"` - Ready int32 `json:"ready"` - Requested int32 `json:"requested"` - UpToDate int32 `json:"up_to_date"` + Ready int32 `json:"ready"` + Requested int32 `json:"requested"` + UpToDate int32 `json:"up_to_date"` } type ReplicaCounts struct { @@ -62,9 +58,6 @@ type WorkerCounts struct { func StatusFromDeployment(deployment *kapps.Deployment) *Status { return &Status{ - APIName: deployment.Labels["apiName"], - APIKind: userconfig.KindFromString(deployment.Labels["apiKind"]), - APIID: deployment.Labels["apiID"], Ready: deployment.Status.ReadyReplicas, Requested: deployment.Status.Replicas, UpToDate: deployment.Status.UpdatedReplicas, From 4cde8f425bf0b0ddc3dc4d9bef23a0b901635ef2 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 21 Jul 2021 20:02:01 +0300 Subject: [PATCH 24/40] WIP on API statuses --- cli/cmd/lib_async_apis.go | 7 +++++-- cli/cmd/lib_task_apis.go | 7 +++++-- 
pkg/operator/resources/asyncapi/api.go | 3 +++ pkg/operator/resources/job/taskapi/api.go | 23 +++++++---------------- pkg/types/spec/api.go | 17 +++++++++++++++++ 5 files changed, 37 insertions(+), 20 deletions(-) diff --git a/cli/cmd/lib_async_apis.go b/cli/cmd/lib_async_apis.go index d810acad62..c0605014aa 100644 --- a/cli/cmd/lib_async_apis.go +++ b/cli/cmd/lib_async_apis.go @@ -60,10 +60,13 @@ func asyncAPIsTable(asyncAPIs []schema.APIResponse, envNames []string) table.Tab rows := make([][]interface{}, 0, len(asyncAPIs)) for i, asyncAPI := range asyncAPIs { - lastUpdated := time.Unix(asyncAPI.Spec.LastUpdated, 0) + if asyncAPI.Metadata == nil || asyncAPI.Status == nil { + continue + } + lastUpdated := time.Unix(asyncAPI.Metadata.LastUpdated, 0) rows = append(rows, []interface{}{ envNames[i], - asyncAPI.Spec.Name, + asyncAPI.Metadata.Name, fmt.Sprintf("%d/%d", asyncAPI.Status.Ready, asyncAPI.Status.Requested), asyncAPI.Status.UpToDate, libtime.SinceStr(&lastUpdated), diff --git a/cli/cmd/lib_task_apis.go b/cli/cmd/lib_task_apis.go index a639cee5f0..cda53e18b8 100644 --- a/cli/cmd/lib_task_apis.go +++ b/cli/cmd/lib_task_apis.go @@ -41,7 +41,10 @@ func taskAPIsTable(taskAPIs []schema.APIResponse, envNames []string) table.Table rows := make([][]interface{}, 0, len(taskAPIs)) for i, taskAPI := range taskAPIs { - lastAPIUpdated := time.Unix(taskAPI.Spec.LastUpdated, 0) + if taskAPI.Metadata == nil { + continue + } + lastAPIUpdated := time.Unix(taskAPI.Metadata.LastUpdated, 0) latestStartTime := time.Time{} latestJobID := "-" runningJobs := 0 @@ -59,7 +62,7 @@ func taskAPIsTable(taskAPIs []schema.APIResponse, envNames []string) table.Table rows = append(rows, []interface{}{ envNames[i], - taskAPI.Spec.Name, + taskAPI.Metadata.Name, runningJobs, latestJobID, libtime.SinceStr(&lastAPIUpdated), diff --git a/pkg/operator/resources/asyncapi/api.go b/pkg/operator/resources/asyncapi/api.go index 4c93657a3c..6cb180de69 100644 --- a/pkg/operator/resources/asyncapi/api.go +++ 
b/pkg/operator/resources/asyncapi/api.go @@ -257,6 +257,9 @@ func GetAllAPIs(deployments []kapps.Deployment, virtualServices []istioclientnet keys := make([]string, len(deployments)) for i := range deployments { + if deployments[i].Labels["cortex.dev/async"] != "api" { + continue + } apiName := deployments[i].Labels["apiName"] keys = append(keys, apiName) diff --git a/pkg/operator/resources/job/taskapi/api.go b/pkg/operator/resources/job/taskapi/api.go index 32aa249636..1c407a42a0 100644 --- a/pkg/operator/resources/job/taskapi/api.go +++ b/pkg/operator/resources/job/taskapi/api.go @@ -147,20 +147,12 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs } for _, virtualService := range virtualServices { - apiName := virtualService.Labels["apiName"] - apiID := virtualService.Labels["apiID"] - - api, err := operator.DownloadAPISpec(apiName, apiID) - if err != nil { - return nil, err - } - - endpoint, err := operator.APIEndpoint(api) + metadata, err := spec.MetadataFromVirtualService(&virtualService) if err != nil { - return nil, err + return nil, errors.Wrap(err, fmt.Sprintf("api %s", metadata.Name)) } - jobStates, err := job.GetMostRecentlySubmittedJobStates(apiName, 1, userconfig.TaskAPIKind) + jobStates, err := job.GetMostRecentlySubmittedJobStates(metadata.Name, 1, userconfig.TaskAPIKind) jobStatuses := []status.TaskJobStatus{} if len(jobStates) > 0 { @@ -172,9 +164,8 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs jobStatuses = append(jobStatuses, *jobStatus) } - taskAPIsMap[apiName] = &schema.APIResponse{ - Spec: api, - Endpoint: &endpoint, + taskAPIsMap[metadata.Name] = &schema.APIResponse{ + Metadata: metadata, TaskJobStatuses: jobStatuses, } } @@ -209,8 +200,8 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs taskAPIList := make([]schema.APIResponse, 0, len(taskAPIsMap)) - for _, batchAPI := range taskAPIsMap { - taskAPIList = append(taskAPIList, *batchAPI) + for 
_, taskAPI := range taskAPIsMap { + taskAPIList = append(taskAPIList, *taskAPI) } return taskAPIList, nil diff --git a/pkg/types/spec/api.go b/pkg/types/spec/api.go index 929f3fd51b..f2aaf22465 100644 --- a/pkg/types/spec/api.go +++ b/pkg/types/spec/api.go @@ -30,6 +30,7 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/hash" s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/types/userconfig" + istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" kapps "k8s.io/api/apps/v1" ) @@ -70,6 +71,22 @@ func MetadataFromDeployment(deployment *kapps.Deployment) (*Metadata, error) { }, nil } +func MetadataFromVirtualService(vs *istioclientnetworking.VirtualService) (*Metadata, error) { + lastUpdated, err := TimeFromAPIID(vs.Labels["apiID"]) + if err != nil { + return nil, err + } + return &Metadata{ + Resource: &userconfig.Resource{ + Name: vs.Labels["apiName"], + Kind: userconfig.KindFromString(vs.Labels["apiKind"]), + }, + APIID: vs.Labels["apiID"], + DeploymentID: vs.Labels["deploymentID"], + LastUpdated: lastUpdated.Unix(), + }, nil +} + /* * ID (uniquely identifies an api configuration for a given deployment) * DeploymentID (used for refreshing a deployment) From d7ca13fb80060e83960be0b6fcedd1515feebe2a Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 21 Jul 2021 21:35:42 +0300 Subject: [PATCH 25/40] WIP on API statuses --- cli/cmd/get.go | 4 -- cli/cmd/lib_async_apis.go | 2 - cli/cmd/lib_batch_apis.go | 7 +++- cli/cmd/lib_realtime_apis.go | 4 -- cli/cmd/lib_traffic_splitters.go | 12 ++++-- pkg/operator/resources/asyncapi/api.go | 14 +++---- pkg/operator/resources/job/batchapi/api.go | 29 ++++++------- pkg/operator/resources/job/taskapi/api.go | 13 ++++-- pkg/operator/resources/realtimeapi/api.go | 12 +++--- pkg/operator/resources/resources.go | 13 +++--- pkg/operator/resources/trafficsplitter/api.go | 41 ++++++++----------- pkg/types/spec/api.go | 2 +- 12 files changed, 70 insertions(+), 83 deletions(-) 
diff --git a/cli/cmd/get.go b/cli/cmd/get.go index f2356fb69a..05d6a2e980 100644 --- a/cli/cmd/get.go +++ b/cli/cmd/get.go @@ -219,10 +219,6 @@ func getAPIsInAllEnvironments() (string, error) { if err == nil { for _, api := range apisRes { - if api.Metadata == nil { - // TODO remove this once the status is present for all - continue - } switch api.Metadata.Kind { case userconfig.BatchAPIKind: allBatchAPIEnvs = append(allBatchAPIEnvs, env.Name) diff --git a/cli/cmd/lib_async_apis.go b/cli/cmd/lib_async_apis.go index c0605014aa..ea42397017 100644 --- a/cli/cmd/lib_async_apis.go +++ b/cli/cmd/lib_async_apis.go @@ -32,8 +32,6 @@ func asyncAPITable(asyncAPI schema.APIResponse, env cliconfig.Environment) (stri var out string t := asyncAPIsTable([]schema.APIResponse{asyncAPI}, []string{env.Name}) - t.FindHeaderByTitle(_titleEnvironment).Hidden = true - t.FindHeaderByTitle(_titleAsyncAPI).Hidden = true out += t.MustFormat() diff --git a/cli/cmd/lib_batch_apis.go b/cli/cmd/lib_batch_apis.go index 1499ac8869..3bc2992acb 100644 --- a/cli/cmd/lib_batch_apis.go +++ b/cli/cmd/lib_batch_apis.go @@ -43,7 +43,10 @@ func batchAPIsTable(batchAPIs []schema.APIResponse, envNames []string) table.Tab rows := make([][]interface{}, 0, len(batchAPIs)) for i, batchAPI := range batchAPIs { - lastAPIUpdated := time.Unix(batchAPI.Spec.LastUpdated, 0) + if batchAPI.Metadata == nil { + continue + } + lastAPIUpdated := time.Unix(batchAPI.Metadata.LastUpdated, 0) latestStartTime := time.Time{} latestJobID := "-" runningJobs := 0 @@ -61,7 +64,7 @@ func batchAPIsTable(batchAPIs []schema.APIResponse, envNames []string) table.Tab rows = append(rows, []interface{}{ envNames[i], - batchAPI.Spec.Name, + batchAPI.Metadata.Name, runningJobs, latestJobID, libtime.SinceStr(&lastAPIUpdated), diff --git a/cli/cmd/lib_realtime_apis.go b/cli/cmd/lib_realtime_apis.go index 36abdaff87..128e5df06d 100644 --- a/cli/cmd/lib_realtime_apis.go +++ b/cli/cmd/lib_realtime_apis.go @@ -32,10 +32,6 @@ func 
realtimeAPITable(realtimeAPI schema.APIResponse, env cliconfig.Environment) var out string t := realtimeAPIsTable([]schema.APIResponse{realtimeAPI}, []string{env.Name}) - // TODO decide on whether we want to keep this consistent with `cortex get` command - // t.FindHeaderByTitle(_titleEnvironment).Hidden = true - // t.FindHeaderByTitle(_titleRealtimeAPI).Hidden = true - out += t.MustFormat() if realtimeAPI.DashboardURL != nil && *realtimeAPI.DashboardURL != "" { diff --git a/cli/cmd/lib_traffic_splitters.go b/cli/cmd/lib_traffic_splitters.go index cc23e00a87..6b1b1a8837 100644 --- a/cli/cmd/lib_traffic_splitters.go +++ b/cli/cmd/lib_traffic_splitters.go @@ -76,7 +76,10 @@ func trafficSplitTable(trafficSplitter schema.APIResponse, env cliconfig.Environ } apiRes := apisRes[0] - lastUpdated := time.Unix(apiRes.Spec.LastUpdated, 0) + if apiRes.Metadata == nil || apiRes.Status == nil { + continue + } + lastUpdated := time.Unix(apiRes.Metadata.LastUpdated, 0) apiName := apiRes.Spec.Name if api.Shadow { @@ -108,7 +111,10 @@ func trafficSplitTable(trafficSplitter schema.APIResponse, env cliconfig.Environ func trafficSplitterListTable(trafficSplitter []schema.APIResponse, envNames []string) table.Table { rows := make([][]interface{}, 0, len(trafficSplitter)) for i, splitAPI := range trafficSplitter { - lastUpdated := time.Unix(splitAPI.Spec.LastUpdated, 0) + if splitAPI.Metadata == nil || splitAPI.Spec == nil { + continue + } + lastUpdated := time.Unix(splitAPI.Metadata.LastUpdated, 0) var apis []string for _, api := range splitAPI.Spec.APIs { apiName := api.Name @@ -120,7 +126,7 @@ func trafficSplitterListTable(trafficSplitter []schema.APIResponse, envNames []s apisStr := s.TruncateEllipses(strings.Join(apis, " "), 50) rows = append(rows, []interface{}{ envNames[i], - splitAPI.Spec.Name, + splitAPI.Metadata.Name, apisStr, libtime.SinceStr(&lastUpdated), }) diff --git a/pkg/operator/resources/asyncapi/api.go b/pkg/operator/resources/asyncapi/api.go index 
6cb180de69..26553ecab8 100644 --- a/pkg/operator/resources/asyncapi/api.go +++ b/pkg/operator/resources/asyncapi/api.go @@ -251,17 +251,17 @@ func DeleteAPI(apiName string, keepCache bool) error { return nil } -func GetAllAPIs(deployments []kapps.Deployment, virtualServices []istioclientnetworking.VirtualService) ([]schema.APIResponse, error) { - asyncAPIs := make([]schema.APIResponse, len(deployments)) - mappedAsyncAPIs := make(map[string]schema.APIResponse, len(deployments)) - keys := make([]string, len(deployments)) +func GetAllAPIs(deployments []kapps.Deployment) ([]schema.APIResponse, error) { + asyncAPIs := make([]schema.APIResponse, 0) + mappedAsyncAPIs := make(map[string]schema.APIResponse, 0) + apiNames := make([]string, 0) for i := range deployments { if deployments[i].Labels["cortex.dev/async"] != "api" { continue } apiName := deployments[i].Labels["apiName"] - keys = append(keys, apiName) + apiNames = append(apiNames, apiName) metadata, err := spec.MetadataFromDeployment(&deployments[i]) if err != nil { @@ -273,8 +273,8 @@ func GetAllAPIs(deployments []kapps.Deployment, virtualServices []istioclientnet } } - sort.Strings(keys) - for _, apiName := range keys { + sort.Strings(apiNames) + for _, apiName := range apiNames { asyncAPIs = append(asyncAPIs, mappedAsyncAPIs[apiName]) } diff --git a/pkg/operator/resources/job/batchapi/api.go b/pkg/operator/resources/job/batchapi/api.go index 6b230847c6..8d87040eef 100644 --- a/pkg/operator/resources/job/batchapi/api.go +++ b/pkg/operator/resources/job/batchapi/api.go @@ -142,23 +142,16 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, batchJob for _, virtualService := range virtualServices { apiName := virtualService.Labels["apiName"] - apiID := virtualService.Labels["apiID"] - - api, err := operator.DownloadAPISpec(apiName, apiID) - if err != nil { - return nil, err - } - - endpoint, err := operator.APIEndpoint(api) + metadata, err := spec.MetadataFromVirtualService(&virtualService) if 
err != nil { - return nil, err + return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } var jobStatuses []status.BatchJobStatus - batchJobs := apiNameToBatchJobsMap[apiName] + batchJobs := apiNameToBatchJobsMap[metadata.Name] if len(batchJobs) == 0 { - jobStates, err := job.GetMostRecentlySubmittedJobStates(apiName, 1, userconfig.BatchAPIKind) + jobStates, err := job.GetMostRecentlySubmittedJobStates(metadata.Name, 1, userconfig.BatchAPIKind) if err != nil { return nil, err } @@ -183,9 +176,8 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, batchJob } } - batchAPIsMap[apiName] = &schema.APIResponse{ - Spec: api, - Endpoint: &endpoint, + batchAPIsMap[metadata.Name] = &schema.APIResponse{ + Metadata: metadata, BatchJobStatuses: jobStatuses, } } @@ -200,10 +192,12 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, batchJob } func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { - virtualService := deployedResource.VirtualService + metadata, err := spec.MetadataFromVirtualService(deployedResource.VirtualService) + if err != nil { + return nil, err + } - apiID := virtualService.Labels["apiID"] - api, err := operator.DownloadAPISpec(deployedResource.Name, apiID) + api, err := operator.DownloadAPISpec(deployedResource.Name, metadata.APIID) if err != nil { return nil, err } @@ -264,6 +258,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { Spec: api, + Metadata: metadata, BatchJobStatuses: jobStatuses, Endpoint: &endpoint, DashboardURL: dashboardURL, diff --git a/pkg/operator/resources/job/taskapi/api.go b/pkg/operator/resources/job/taskapi/api.go index 1c407a42a0..6c6afdf425 100644 --- a/pkg/operator/resources/job/taskapi/api.go +++ b/pkg/operator/resources/job/taskapi/api.go @@ -147,9 +147,11 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs } for _, virtualService := range 
virtualServices { + apiName := virtualService.Labels["apiName"] + metadata, err := spec.MetadataFromVirtualService(&virtualService) if err != nil { - return nil, errors.Wrap(err, fmt.Sprintf("api %s", metadata.Name)) + return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } jobStates, err := job.GetMostRecentlySubmittedJobStates(metadata.Name, 1, userconfig.TaskAPIKind) @@ -209,10 +211,12 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs // GetAPIByName returns a single task API and its most recently submitted job along with all running task jobs func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { - virtualService := deployedResource.VirtualService + metadata, err := spec.MetadataFromVirtualService(deployedResource.VirtualService) + if err != nil { + return nil, err + } - apiID := virtualService.Labels["apiID"] - api, err := operator.DownloadAPISpec(deployedResource.Name, apiID) + api, err := operator.DownloadAPISpec(deployedResource.Name, metadata.APIID) if err != nil { return nil, err } @@ -287,6 +291,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { Spec: api, + Metadata: metadata, TaskJobStatuses: jobStatuses, Endpoint: &endpoint, DashboardURL: dashboardURL, diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 14d24b3e04..d787f5c99d 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -176,14 +176,14 @@ func DeleteAPI(apiName string, keepCache bool) error { return nil } -func GetAllAPIs(deployments []kapps.Deployment, virtualServices []istioclientnetworking.VirtualService) ([]schema.APIResponse, error) { +func GetAllAPIs(deployments []kapps.Deployment) ([]schema.APIResponse, error) { realtimeAPIs := make([]schema.APIResponse, len(deployments)) mappedRealtimeAPIs := make(map[string]schema.APIResponse, 
len(deployments)) - keys := make([]string, len(deployments)) + apiNames := make([]string, len(deployments)) for i := range deployments { apiName := deployments[i].Labels["apiName"] - keys = append(keys, apiName) + apiNames[i] = apiName metadata, err := spec.MetadataFromDeployment(&deployments[i]) if err != nil { @@ -195,9 +195,9 @@ func GetAllAPIs(deployments []kapps.Deployment, virtualServices []istioclientnet } } - sort.Strings(keys) - for _, apiName := range keys { - realtimeAPIs = append(realtimeAPIs, mappedRealtimeAPIs[apiName]) + sort.Strings(apiNames) + for i := range apiNames { + realtimeAPIs[i] = mappedRealtimeAPIs[apiNames[i]] } return realtimeAPIs, nil diff --git a/pkg/operator/resources/resources.go b/pkg/operator/resources/resources.go index 690616e923..5350f99e47 100644 --- a/pkg/operator/resources/resources.go +++ b/pkg/operator/resources/resources.go @@ -308,18 +308,15 @@ func GetAPIs() ([]schema.APIResponse, error) { } } - var realtimeAPIVirtualServices []istioclientnetworking.VirtualService - var asyncAPIVirtualServices []istioclientnetworking.VirtualService + fmt.Println("realtimeAPIDeployments", len(realtimeAPIDeployments)) + fmt.Println("asyncAPIDeployments", len(asyncAPIDeployments)) + var batchAPIVirtualServices []istioclientnetworking.VirtualService var taskAPIVirtualServices []istioclientnetworking.VirtualService var trafficSplitterVirtualServices []istioclientnetworking.VirtualService for _, vs := range virtualServices { switch vs.Labels["apiKind"] { - case userconfig.RealtimeAPIKind.String(): - realtimeAPIVirtualServices = append(realtimeAPIVirtualServices, vs) - case userconfig.AsyncAPIKind.String(): - asyncAPIVirtualServices = append(asyncAPIVirtualServices, vs) case userconfig.BatchAPIKind.String(): batchAPIVirtualServices = append(batchAPIVirtualServices, vs) case userconfig.TrafficSplitterKind.String(): @@ -329,7 +326,7 @@ func GetAPIs() ([]schema.APIResponse, error) { } } - realtimeAPIList, err := 
realtimeapi.GetAllAPIs(realtimeAPIDeployments, realtimeAPIVirtualServices) + realtimeAPIList, err := realtimeapi.GetAllAPIs(realtimeAPIDeployments) if err != nil { return nil, err } @@ -345,7 +342,7 @@ func GetAPIs() ([]schema.APIResponse, error) { return nil, err } - asyncAPIList, err := asyncapi.GetAllAPIs(asyncAPIDeployments, asyncAPIVirtualServices) + asyncAPIList, err := asyncapi.GetAllAPIs(asyncAPIDeployments) if err != nil { return nil, err } diff --git a/pkg/operator/resources/trafficsplitter/api.go b/pkg/operator/resources/trafficsplitter/api.go index 3492544d82..a30a7579fd 100644 --- a/pkg/operator/resources/trafficsplitter/api.go +++ b/pkg/operator/resources/trafficsplitter/api.go @@ -132,35 +132,20 @@ func getTrafficSplitterDestinations(trafficSplitter *spec.API) []k8s.Destination // GetAllAPIs returns a list of metadata, in the form of schema.APIResponse, about all the created traffic splitter APIs func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService) ([]schema.APIResponse, error) { - var ( - apiNames []string - apiIDs []string - trafficSplitters []schema.APIResponse - ) - + var trafficSplitters []schema.APIResponse for _, virtualService := range virtualServices { - if virtualService.Labels["apiKind"] == userconfig.TrafficSplitterKind.String() { - apiNames = append(apiNames, virtualService.Labels["apiName"]) - apiIDs = append(apiIDs, virtualService.Labels["apiID"]) - } - } - - apis, err := operator.DownloadAPISpecs(apiNames, apiIDs) - if err != nil { - return nil, err - } + apiName := virtualService.Labels["apiName"] - for i := range apis { - trafficSplitter := apis[i] - endpoint, err := operator.APIEndpoint(&trafficSplitter) + metadata, err := spec.MetadataFromVirtualService(&virtualService) if err != nil { - return nil, err + return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } - trafficSplitters = append(trafficSplitters, schema.APIResponse{ - Spec: &trafficSplitter, - Endpoint: &endpoint, - }) + if metadata.Kind == 
userconfig.TrafficSplitterKind { + trafficSplitters = append(trafficSplitters, schema.APIResponse{ + Metadata: metadata, + }) + } } return trafficSplitters, nil @@ -168,7 +153,12 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService) ([]schem // GetAPIByName retrieves the metadata, in the form of schema.APIResponse, of a single traffic splitter API func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { - api, err := operator.DownloadAPISpec(deployedResource.Name, deployedResource.VirtualService.Labels["apiID"]) + metadata, err := spec.MetadataFromVirtualService(deployedResource.VirtualService) + if err != nil { + return nil, err + } + + api, err := operator.DownloadAPISpec(deployedResource.Name, metadata.APIID) if err != nil { return nil, err } @@ -181,6 +171,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { Spec: api, + Metadata: metadata, Endpoint: &endpoint, }, }, nil diff --git a/pkg/types/spec/api.go b/pkg/types/spec/api.go index f2aaf22465..b229962aee 100644 --- a/pkg/types/spec/api.go +++ b/pkg/types/spec/api.go @@ -51,7 +51,7 @@ type API struct { type Metadata struct { *userconfig.Resource APIID string `json:"id"` - DeploymentID string `json:"deployment_id"` + DeploymentID string `json:"deployment_id,omitempty"` LastUpdated int64 `json:"last_updated"` } From 34dc5a3feaab9a64b4ad75f3bce320e26dc44608 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Thu, 22 Jul 2021 17:16:55 +0300 Subject: [PATCH 26/40] Changes to the TrafficSplitter --- cli/cmd/lib_traffic_splitters.go | 14 ++------------ pkg/activator/helpers.go | 1 + pkg/operator/resources/trafficsplitter/api.go | 11 +++++++++++ pkg/types/userconfig/api.go | 12 ++++++++++++ pkg/types/userconfig/config_key.go | 1 + 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/cli/cmd/lib_traffic_splitters.go b/cli/cmd/lib_traffic_splitters.go index 6b1b1a8837..8eaf6b048b 
100644 --- a/cli/cmd/lib_traffic_splitters.go +++ b/cli/cmd/lib_traffic_splitters.go @@ -45,7 +45,6 @@ func trafficSplitterTable(trafficSplitter schema.APIResponse, env cliconfig.Envi if err != nil { return "", err } - t.FindHeaderByTitle(_titleEnvironment).Hidden = true out += t.MustFormat() @@ -111,23 +110,14 @@ func trafficSplitTable(trafficSplitter schema.APIResponse, env cliconfig.Environ func trafficSplitterListTable(trafficSplitter []schema.APIResponse, envNames []string) table.Table { rows := make([][]interface{}, 0, len(trafficSplitter)) for i, splitAPI := range trafficSplitter { - if splitAPI.Metadata == nil || splitAPI.Spec == nil { + if splitAPI.Metadata == nil || splitAPI.Status == nil { continue } lastUpdated := time.Unix(splitAPI.Metadata.LastUpdated, 0) - var apis []string - for _, api := range splitAPI.Spec.APIs { - apiName := api.Name - if api.Shadow { - apiName += " (shadow)" - } - apis = append(apis, apiName+":"+s.Int32(api.Weight)) - } - apisStr := s.TruncateEllipses(strings.Join(apis, " "), 50) rows = append(rows, []interface{}{ envNames[i], splitAPI.Metadata.Name, - apisStr, + s.Int32(splitAPI.Status.Ready), libtime.SinceStr(&lastUpdated), }) } diff --git a/pkg/activator/helpers.go b/pkg/activator/helpers.go index f32c7e54f2..48790b5ac0 100644 --- a/pkg/activator/helpers.go +++ b/pkg/activator/helpers.go @@ -66,6 +66,7 @@ func getAPIMeta(obj interface{}) (apiMeta, error) { }, nil } +// TODO move this out of here func concurrencyFromAnnotations(annotations map[string]string) (int, int, error) { maxQueueLength, err := strconv.Atoi(annotations[userconfig.MaxQueueLengthAnnotationKey]) if err != nil { diff --git a/pkg/operator/resources/trafficsplitter/api.go b/pkg/operator/resources/trafficsplitter/api.go index a30a7579fd..fa02f8ec35 100644 --- a/pkg/operator/resources/trafficsplitter/api.go +++ b/pkg/operator/resources/trafficsplitter/api.go @@ -30,6 +30,7 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/operator" 
"github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/cortexlabs/cortex/pkg/workloads" istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" @@ -141,9 +142,19 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService) ([]schem return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } + targets, err := userconfig.TrafficSplitterTargetsFromAnnotations(&virtualService) + if err != nil { + return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) + } + if metadata.Kind == userconfig.TrafficSplitterKind { trafficSplitters = append(trafficSplitters, schema.APIResponse{ Metadata: metadata, + Status: &status.Status{ + Ready: targets, + Requested: targets, + UpToDate: targets, + }, }) } } diff --git a/pkg/types/userconfig/api.go b/pkg/types/userconfig/api.go index c2f8585941..1872187f8c 100644 --- a/pkg/types/userconfig/api.go +++ b/pkg/types/userconfig/api.go @@ -155,6 +155,10 @@ func IdentifyAPI(filePath string, name string, kind Kind, index int) string { func (api *API) ToK8sAnnotations() map[string]string { annotations := map[string]string{} + if len(api.APIs) > 0 { + annotations[NumberOfTrafficSplitterTargets] = s.Int32(int32(len(api.APIs))) + } + if api.Pod != nil && api.Kind == RealtimeAPIKind { annotations[MaxConcurrencyAnnotationKey] = s.Int64(api.Pod.MaxConcurrency) annotations[MaxQueueLengthAnnotationKey] = s.Int64(api.Pod.MaxQueueLength) @@ -245,6 +249,14 @@ func AutoscalingFromAnnotations(k8sObj kmeta.Object) (*Autoscaling, error) { return &a, nil } +func TrafficSplitterTargetsFromAnnotations(k8sObj kmeta.Object) (int32, error) { + targets, err := k8s.ParseInt32Annotation(k8sObj, NumberOfTrafficSplitterTargets) + if err != nil { + return 0, err + } + return targets, nil +} + func (api *API) UserStr() string { var sb strings.Builder sb.WriteString(fmt.Sprintf("%s: 
%s\n", NameKey, api.Name)) diff --git a/pkg/types/userconfig/config_key.go b/pkg/types/userconfig/config_key.go index 826e144b05..263f764bd6 100644 --- a/pkg/types/userconfig/config_key.go +++ b/pkg/types/userconfig/config_key.go @@ -91,6 +91,7 @@ const ( EndpointAnnotationKey = "networking.cortex.dev/endpoint" MaxConcurrencyAnnotationKey = "pod.cortex.dev/max-concurrency" MaxQueueLengthAnnotationKey = "pod.cortex.dev/max-queue-length" + NumberOfTrafficSplitterTargets = "apis.cortex.dev/traffic-splitter-targets" MinReplicasAnnotationKey = "autoscaling.cortex.dev/min-replicas" MaxReplicasAnnotationKey = "autoscaling.cortex.dev/max-replicas" TargetInFlightAnnotationKey = "autoscaling.cortex.dev/target-in-flight" From 51606a799a6f1756344a245d5dd951d1fa7da1e0 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 23 Jul 2021 23:19:11 +0300 Subject: [PATCH 27/40] WIP on API statuses --- cli/cluster/get.go | 14 ++ cli/cmd/describe.go | 113 ++++++++++++++ cli/cmd/get.go | 8 +- cli/cmd/lib_apis.go | 59 ++++++++ cli/cmd/lib_async_apis.go | 27 ++++ cli/cmd/lib_batch_apis.go | 32 ++-- cli/cmd/lib_realtime_apis.go | 27 ++++ cli/cmd/lib_task_apis.go | 32 ++-- cli/cmd/lib_watch.go | 4 +- cli/cmd/root.go | 2 + cmd/operator/main.go | 1 + pkg/consts/consts.go | 3 +- .../crd/bases/batch.cortex.dev_batchjobs.yaml | 19 ++- .../batch/batchjob_controller_helpers.go | 86 +++++++---- pkg/lib/k8s/pod.go | 140 +++++++++++++----- pkg/operator/endpoints/describe.go | 36 +++++ pkg/operator/operator/k8s.go | 11 +- pkg/operator/resources/asyncapi/api.go | 127 ++++++++++++++-- pkg/operator/resources/asyncapi/status.go | 137 +++-------------- pkg/operator/resources/job/worker_stats.go | 31 ++-- pkg/operator/resources/realtimeapi/api.go | 43 +++++- pkg/operator/resources/realtimeapi/status.go | 77 +++------- pkg/operator/resources/resources.go | 33 ++++- pkg/types/status/code.go | 97 ------------ pkg/types/status/status.go | 103 ++++++++++--- pkg/types/userconfig/api.go | 8 + 26 files 
changed, 847 insertions(+), 423 deletions(-) create mode 100644 cli/cmd/describe.go create mode 100644 cli/cmd/lib_apis.go create mode 100644 pkg/operator/endpoints/describe.go delete mode 100644 pkg/types/status/code.go diff --git a/cli/cluster/get.go b/cli/cluster/get.go index 47a24aa0a3..6d88e707b8 100644 --- a/cli/cluster/get.go +++ b/cli/cluster/get.go @@ -51,6 +51,20 @@ func GetAPI(operatorConfig OperatorConfig, apiName string) ([]schema.APIResponse return apiRes, nil } +func DescribeAPI(operatorConfig OperatorConfig, apiName string) ([]schema.APIResponse, error) { + httpRes, err := HTTPGet(operatorConfig, "/describe/"+apiName) + if err != nil { + return nil, err + } + + var apiRes []schema.APIResponse + if err = json.Unmarshal(httpRes, &apiRes); err != nil { + return nil, errors.Wrap(err, "/describe/"+apiName, string(httpRes)) + } + + return apiRes, nil +} + func GetAPIByID(operatorConfig OperatorConfig, apiName string, apiID string) ([]schema.APIResponse, error) { httpRes, err := HTTPGet(operatorConfig, "/get/"+apiName+"/"+apiID) if err != nil { diff --git a/cli/cmd/describe.go b/cli/cmd/describe.go new file mode 100644 index 0000000000..be23ef6532 --- /dev/null +++ b/cli/cmd/describe.go @@ -0,0 +1,113 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package cmd + +import ( + "fmt" + + "github.com/cortexlabs/cortex/cli/cluster" + "github.com/cortexlabs/cortex/cli/types/cliconfig" + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/exit" + "github.com/cortexlabs/cortex/pkg/lib/telemetry" + "github.com/cortexlabs/cortex/pkg/types/userconfig" + "github.com/spf13/cobra" +) + +const ( + _titleReplicaStatus = "replica status" + _titleReplicaCount = "replica count" +) + +var ( + _flagDescribeEnv string + _flagDescribeWatch bool +) + +func describeInit() { + _describeCmd.Flags().SortFlags = false + _describeCmd.Flags().StringVarP(&_flagDescribeEnv, "env", "e", "", "environment to use") + _describeCmd.Flags().BoolVarP(&_flagDescribeWatch, "watch", "w", false, "re-run the command every 2 seconds") +} + +var _describeCmd = &cobra.Command{ + Use: "describe [API_NAME]", + Short: "describe an api", + Args: cobra.ExactArgs(1), + Run: func(cmd *cobra.Command, args []string) { + apiName := args[0] + + var envName string + if wasFlagProvided(cmd, "env") { + envName = _flagDescribeEnv + } else { + var err error + envName, err = getEnvFromFlag("") + if err != nil { + telemetry.Event("cli.describe") + exit.Error(err) + } + } + + env, err := ReadOrConfigureEnv(envName) + if err != nil { + telemetry.Event("cli.describe") + exit.Error(err) + } + telemetry.Event("cli.describe", map[string]interface{}{"env_name": env.Name}) + + rerun(_flagDescribeWatch, func() (string, error) { + env, err := ReadOrConfigureEnv(envName) + if err != nil { + exit.Error(err) + } + + out, err := envStringIfNotSpecified(envName, cmd) + if err != nil { + return "", err + } + apiTable, err := describeAPI(env, apiName) + if err != nil { + return "", err + } + + return out + apiTable, nil + }) + }, +} + +func describeAPI(env cliconfig.Environment, apiName string) (string, error) { + apisRes, err := cluster.DescribeAPI(MustGetOperatorConfig(env.Name), apiName) + if err != nil { + return "", err + } + + if len(apisRes) == 
0 { + exit.Error(errors.ErrorUnexpected(fmt.Sprintf("unable to find API %s", apiName))) + } + + apiRes := apisRes[0] + + switch apiRes.Metadata.Kind { + case userconfig.RealtimeAPIKind: + return realtimeDescribeAPITable(apiRes, env) + case userconfig.AsyncAPIKind: + return asyncDescribeAPITable(apiRes, env) + default: + return "", errors.ErrorUnexpected(fmt.Sprintf("encountered unexpected kind %s for api %s", apiRes.Spec.Kind, apiRes.Spec.Name)) + } +} diff --git a/cli/cmd/get.go b/cli/cmd/get.go index 05d6a2e980..e0083dfa99 100644 --- a/cli/cmd/get.go +++ b/cli/cmd/get.go @@ -48,14 +48,14 @@ const ( ) var ( - _flagGetEnv string - _flagWatch bool + _flagGetEnv string + _flagGetWatch bool ) func getInit() { _getCmd.Flags().SortFlags = false _getCmd.Flags().StringVarP(&_flagGetEnv, "env", "e", "", "environment to use") - _getCmd.Flags().BoolVarP(&_flagWatch, "watch", "w", false, "re-run the command every 2 seconds") + _getCmd.Flags().BoolVarP(&_flagGetWatch, "watch", "w", false, "re-run the command every 2 seconds") _getCmd.Flags().VarP(&_flagOutput, "output", "o", fmt.Sprintf("output format: one of %s", strings.Join(flags.OutputTypeStringsExcluding(flags.YAMLOutputType), "|"))) addVerboseFlag(_getCmd) } @@ -88,7 +88,7 @@ var _getCmd = &cobra.Command{ telemetry.Event("cli.get") } - rerun(func() (string, error) { + rerun(_flagGetWatch, func() (string, error) { if len(args) == 1 { env, err := ReadOrConfigureEnv(envName) if err != nil { diff --git a/cli/cmd/lib_apis.go b/cli/cmd/lib_apis.go new file mode 100644 index 0000000000..23514342dc --- /dev/null +++ b/cli/cmd/lib_apis.go @@ -0,0 +1,59 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cmd + +import ( + "github.com/cortexlabs/cortex/pkg/lib/table" + "github.com/cortexlabs/cortex/pkg/types/status" +) + +func replicaCountTable(counts *status.ReplicaCounts) table.Table { + var rows [][]interface{} + for _, replicaCountType := range status.ReplicaCountTypes { + count := counts.GetCountBy(replicaCountType) + canBeHiddenIfZero := false + switch replicaCountType { + case status.ReplicaCountFailed: + canBeHiddenIfZero = true + case status.ReplicaCountKilled: + canBeHiddenIfZero = true + case status.ReplicaCountKilledOOM: + canBeHiddenIfZero = true + case status.ReplicaCountErrImagePull: + canBeHiddenIfZero = true + case status.ReplicaCountUnknown: + canBeHiddenIfZero = true + case status.ReplicaCountStalled: + canBeHiddenIfZero = true + } + if count == 0 && canBeHiddenIfZero { + continue + } + rows = append(rows, []interface{}{ + replicaCountType, + count, + }) + } + + return table.Table{ + Headers: []table.Header{ + {Title: _titleReplicaStatus, MinWidth: 32, MaxWidth: 32}, + {Title: _titleReplicaCount}, + }, + Rows: rows, + } +} diff --git a/cli/cmd/lib_async_apis.go b/cli/cmd/lib_async_apis.go index ea42397017..e2e4441003 100644 --- a/cli/cmd/lib_async_apis.go +++ b/cli/cmd/lib_async_apis.go @@ -23,6 +23,7 @@ import ( "github.com/cortexlabs/cortex/cli/types/cliconfig" "github.com/cortexlabs/cortex/pkg/lib/console" + "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/table" libtime "github.com/cortexlabs/cortex/pkg/lib/time" "github.com/cortexlabs/cortex/pkg/operator/schema" @@ -54,6 +55,32 @@ func 
asyncAPITable(asyncAPI schema.APIResponse, env cliconfig.Environment) (stri return out, nil } +func asyncDescribeAPITable(asyncAPI schema.APIResponse, env cliconfig.Environment) (string, error) { + if asyncAPI.Metadata == nil { + return "", errors.ErrorUnexpected("missing metadata from operator response") + } + + if asyncAPI.Status == nil { + return "", errors.ErrorUnexpected(fmt.Sprintf("missing status for %s api", asyncAPI.Metadata.Name)) + } + + t := asyncAPIsTable([]schema.APIResponse{asyncAPI}, []string{env.Name}) + out := t.MustFormat() + + if asyncAPI.DashboardURL != nil && *asyncAPI.DashboardURL != "" { + out += "\n" + console.Bold("metrics dashboard: ") + *asyncAPI.DashboardURL + "\n" + } + + if asyncAPI.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *asyncAPI.Endpoint + "\n" + } + + t = replicaCountTable(asyncAPI.Status.ReplicaCounts) + out += "\n" + t.MustFormat() + + return out, nil +} + func asyncAPIsTable(asyncAPIs []schema.APIResponse, envNames []string) table.Table { rows := make([][]interface{}, 0, len(asyncAPIs)) diff --git a/cli/cmd/lib_batch_apis.go b/cli/cmd/lib_batch_apis.go index 3bc2992acb..5cebcdd2ba 100644 --- a/cli/cmd/lib_batch_apis.go +++ b/cli/cmd/lib_batch_apis.go @@ -221,22 +221,34 @@ func getBatchJob(env cliconfig.Environment, apiName string, jobID string) (strin if job.WorkerCounts != nil { t := table.Table{ Headers: []table.Header{ - {Title: "requested"}, - {Title: "pending", Hidden: job.WorkerCounts.Pending == 0}, - {Title: "initializing", Hidden: job.WorkerCounts.Initializing == 0}, - {Title: "stalled", Hidden: job.WorkerCounts.Stalled == 0}, - {Title: "running"}, - {Title: "failed", Hidden: job.WorkerCounts.Failed == 0}, - {Title: "succeeded"}, + {Title: "Requested"}, + {Title: "Pending", Hidden: job.WorkerCounts.Pending == 0}, + {Title: "Creating", Hidden: job.WorkerCounts.Creating == 0}, + {Title: "Ready"}, + {Title: "NotReady"}, + {Title: "ErrImagePull", Hidden: job.WorkerCounts.ErrImagePull == 0}, + {Title: 
"Terminating", Hidden: job.WorkerCounts.Terminating == 0}, + {Title: "Failed", Hidden: job.WorkerCounts.Failed == 0}, + {Title: "Killed", Hidden: job.WorkerCounts.Killed == 0}, + {Title: "KilledOOM", Hidden: job.WorkerCounts.KilledOOM == 0}, + {Title: "Stalled", Hidden: job.WorkerCounts.Stalled == 0}, + {Title: "Unknown", Hidden: job.WorkerCounts.Unknown == 0}, + {Title: "Succeeded"}, }, Rows: [][]interface{}{ { job.Workers, job.WorkerCounts.Pending, - job.WorkerCounts.Initializing, - job.WorkerCounts.Stalled, - job.WorkerCounts.Running, + job.WorkerCounts.Creating, + job.WorkerCounts.Ready, + job.WorkerCounts.NotReady, + job.WorkerCounts.ErrImagePull, + job.WorkerCounts.Terminating, job.WorkerCounts.Failed, + job.WorkerCounts.Killed, + job.WorkerCounts.KilledOOM, + job.WorkerCounts.Stalled, + job.WorkerCounts.Unknown, job.WorkerCounts.Succeeded, }, }, diff --git a/cli/cmd/lib_realtime_apis.go b/cli/cmd/lib_realtime_apis.go index 128e5df06d..dd73db1282 100644 --- a/cli/cmd/lib_realtime_apis.go +++ b/cli/cmd/lib_realtime_apis.go @@ -23,6 +23,7 @@ import ( "github.com/cortexlabs/cortex/cli/types/cliconfig" "github.com/cortexlabs/cortex/pkg/lib/console" + "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/table" libtime "github.com/cortexlabs/cortex/pkg/lib/time" "github.com/cortexlabs/cortex/pkg/operator/schema" @@ -53,6 +54,32 @@ func realtimeAPITable(realtimeAPI schema.APIResponse, env cliconfig.Environment) return out, nil } +func realtimeDescribeAPITable(realtimeAPI schema.APIResponse, env cliconfig.Environment) (string, error) { + if realtimeAPI.Metadata == nil { + return "", errors.ErrorUnexpected("missing metadata from operator response") + } + + if realtimeAPI.Status == nil { + return "", errors.ErrorUnexpected(fmt.Sprintf("missing status for %s api", realtimeAPI.Metadata.Name)) + } + + t := realtimeAPIsTable([]schema.APIResponse{realtimeAPI}, []string{env.Name}) + out := t.MustFormat() + + if realtimeAPI.DashboardURL != nil 
&& *realtimeAPI.DashboardURL != "" { + out += "\n" + console.Bold("metrics dashboard: ") + *realtimeAPI.DashboardURL + "\n" + } + + if realtimeAPI.Endpoint != nil { + out += "\n" + console.Bold("endpoint: ") + *realtimeAPI.Endpoint + "\n" + } + + t = replicaCountTable(realtimeAPI.Status.ReplicaCounts) + out += "\n" + t.MustFormat() + + return out, nil +} + func realtimeAPIsTable(realtimeAPIs []schema.APIResponse, envNames []string) table.Table { rows := make([][]interface{}, 0, len(realtimeAPIs)) diff --git a/cli/cmd/lib_task_apis.go b/cli/cmd/lib_task_apis.go index cda53e18b8..49541aad77 100644 --- a/cli/cmd/lib_task_apis.go +++ b/cli/cmd/lib_task_apis.go @@ -181,22 +181,34 @@ func getTaskJob(env cliconfig.Environment, apiName string, jobID string) (string if job.WorkerCounts != nil { t := table.Table{ Headers: []table.Header{ - {Title: "requested"}, - {Title: "pending", Hidden: job.WorkerCounts.Pending == 0}, - {Title: "initializing", Hidden: job.WorkerCounts.Initializing == 0}, - {Title: "stalled", Hidden: job.WorkerCounts.Stalled == 0}, - {Title: "running"}, - {Title: "failed", Hidden: job.WorkerCounts.Failed == 0}, - {Title: "succeeded"}, + {Title: "Requested"}, + {Title: "Pending", Hidden: job.WorkerCounts.Pending == 0}, + {Title: "Creating", Hidden: job.WorkerCounts.Creating == 0}, + {Title: "Ready"}, + {Title: "NotReady"}, + {Title: "ErrImagePull", Hidden: job.WorkerCounts.ErrImagePull == 0}, + {Title: "Terminating", Hidden: job.WorkerCounts.Terminating == 0}, + {Title: "Failed", Hidden: job.WorkerCounts.Failed == 0}, + {Title: "Killed", Hidden: job.WorkerCounts.Killed == 0}, + {Title: "KilledOOM", Hidden: job.WorkerCounts.KilledOOM == 0}, + {Title: "Stalled", Hidden: job.WorkerCounts.Stalled == 0}, + {Title: "Unknown", Hidden: job.WorkerCounts.Unknown == 0}, + {Title: "Succeeded"}, }, Rows: [][]interface{}{ { job.Workers, job.WorkerCounts.Pending, - job.WorkerCounts.Initializing, - job.WorkerCounts.Stalled, - job.WorkerCounts.Running, + 
job.WorkerCounts.Creating, + job.WorkerCounts.Ready, + job.WorkerCounts.NotReady, + job.WorkerCounts.ErrImagePull, + job.WorkerCounts.Terminating, job.WorkerCounts.Failed, + job.WorkerCounts.Killed, + job.WorkerCounts.KilledOOM, + job.WorkerCounts.Stalled, + job.WorkerCounts.Unknown, job.WorkerCounts.Succeeded, }, }, diff --git a/cli/cmd/lib_watch.go b/cli/cmd/lib_watch.go index 06aebb26c2..a0f9043492 100644 --- a/cli/cmd/lib_watch.go +++ b/cli/cmd/lib_watch.go @@ -56,8 +56,8 @@ func watchHeader() string { return fmt.Sprintf("$ %s %s%s", _cmdStr, padding, libtime.LocalHourNow()) } -func rerun(f func() (string, error)) { - if _flagWatch { +func rerun(watchFlag bool, f func() (string, error)) { + if watchFlag { print("\033[H\033[2J") // clear the screen var prevStrSlice []string diff --git a/cli/cmd/root.go b/cli/cmd/root.go index 68649c0cc1..8aa7d1e0e0 100644 --- a/cli/cmd/root.go +++ b/cli/cmd/root.go @@ -112,6 +112,7 @@ func init() { clusterInit() completionInit() deleteInit() + describeInit() deployInit() envInit() getInit() @@ -154,6 +155,7 @@ func Execute() { _rootCmd.AddCommand(_deployCmd) _rootCmd.AddCommand(_getCmd) + _rootCmd.AddCommand(_describeCmd) _rootCmd.AddCommand(_logsCmd) _rootCmd.AddCommand(_refreshCmd) _rootCmd.AddCommand(_deleteCmd) diff --git a/cmd/operator/main.go b/cmd/operator/main.go index bf5a50d33b..ac38ee7130 100644 --- a/cmd/operator/main.go +++ b/cmd/operator/main.go @@ -105,6 +105,7 @@ func main() { routerWithAuth.HandleFunc("/get", endpoints.GetAPIs).Methods("GET") routerWithAuth.HandleFunc("/get/{apiName}", endpoints.GetAPI).Methods("GET") routerWithAuth.HandleFunc("/get/{apiName}/{apiID}", endpoints.GetAPIByID).Methods("GET") + routerWithAuth.HandleFunc("/describe/{apiName}", endpoints.DescribeAPI).Methods("GET") routerWithAuth.HandleFunc("/streamlogs/{apiName}", endpoints.ReadLogs) routerWithAuth.HandleFunc("/logs/{apiName}", endpoints.GetLogURL).Methods("GET") diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go index 
7ea590fc45..3fe860d776 100644 --- a/pkg/consts/consts.go +++ b/pkg/consts/consts.go @@ -76,8 +76,7 @@ var ( CortexProbeHeader = "X-Cortex-Probe" CortexOriginHeader = "X-Cortex-Origin" - WaitForInitializingReplicasTimeout = 15 * time.Minute - WaitForReadyReplicasTimeout = 20 * time.Minute + WaitForReadyReplicasTimeout = 20 * time.Minute ) func DefaultRegistry() string { diff --git a/pkg/crds/config/crd/bases/batch.cortex.dev_batchjobs.yaml b/pkg/crds/config/crd/bases/batch.cortex.dev_batchjobs.yaml index 63b1987bd9..a60ccbba4a 100644 --- a/pkg/crds/config/crd/bases/batch.cortex.dev_batchjobs.yaml +++ b/pkg/crds/config/crd/bases/batch.cortex.dev_batchjobs.yaml @@ -251,16 +251,28 @@ spec: worker_counts: description: Detailed worker counts with respective status properties: + creating: + format: int32 + type: integer + err_image_pull: + format: int32 + type: integer failed: format: int32 type: integer - initializing: + killed: + format: int32 + type: integer + killed_oom: + format: int32 + type: integer + not_ready: format: int32 type: integer pending: format: int32 type: integer - running: + ready: format: int32 type: integer stalled: @@ -269,6 +281,9 @@ spec: succeeded: format: int32 type: integer + terminating: + format: int32 + type: integer unknown: format: int32 type: integer diff --git a/pkg/crds/controllers/batch/batchjob_controller_helpers.go b/pkg/crds/controllers/batch/batchjob_controller_helpers.go index dd052dfc58..0785628452 100644 --- a/pkg/crds/controllers/batch/batchjob_controller_helpers.go +++ b/pkg/crds/controllers/batch/batchjob_controller_helpers.go @@ -442,6 +442,21 @@ func (r *BatchJobReconciler) getWorkerJob(ctx context.Context, batchJob batch.Ba return &job, nil } +func (r *BatchJobReconciler) getWorkerJobPods(ctx context.Context, batchJob batch.BatchJob) ([]kcore.Pod, error) { + workerJobPods := kcore.PodList{} + if err := r.List(ctx, &workerJobPods, + client.InNamespace(consts.DefaultNamespace), + client.MatchingLabels{ + "jobID": 
batchJob.Name, + "apiName": batchJob.Spec.APIName, + "apiID": batchJob.Spec.APIID, + }, + ); err != nil { + return nil, err + } + return workerJobPods.Items, nil +} + func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.BatchJob, statusInfo batchJobStatusInfo) error { batchJob.Status.ID = batchJob.Name @@ -461,6 +476,11 @@ func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.B batchJob.Status.TotalBatchCount = statusInfo.TotalBatchCount } + workerJobPods, err := r.getWorkerJobPods(ctx, *batchJob) + if err != nil { + return errors.Wrap(err, "failed to retrieve worker pods") + } + worker := statusInfo.WorkerJob if worker != nil { batchJob.Status.EndTime = worker.Status.CompletionTime // assign right away, because it's a pointer @@ -486,13 +506,11 @@ func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.B } } - isWorkerOOM, err := r.checkWorkersOOM(ctx, batchJob) - if err != nil { - return err - } - - if isWorkerOOM { - batchJobStatus = status.JobWorkerOOM + for i := range workerJobPods { + if k8s.WasPodOOMKilled(&workerJobPods[i]) { + batchJobStatus = status.JobWorkerOOM + break + } } batchJob.Status.Status = batchJobStatus @@ -512,11 +530,36 @@ func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.B batchJob.Status.Status = status.JobRunning } - batchJob.Status.WorkerCounts = &status.WorkerCounts{ - Running: worker.Status.Active, - Succeeded: worker.Status.Succeeded, - Failed: worker.Status.Failed, + // TODO move this to its own function + workerCounts := status.WorkerCounts{} + for i := range workerJobPods { + switch k8s.GetPodStatus(&workerJobPods[i]) { + case k8s.PodStatusPending: + workerCounts.Pending++ + case k8s.PodStatusStalled: + workerCounts.Stalled++ + case k8s.PodStatusCreating: + workerCounts.Creating++ + case k8s.PodStatusNotReady: + workerCounts.NotReady++ + case k8s.PodStatusErrImagePull: + workerCounts.ErrImagePull++ + case k8s.PodStatusTerminating: 
+ workerCounts.Terminating++ + case k8s.PodStatusFailed: + workerCounts.Failed++ + case k8s.PodStatusKilled: + workerCounts.Killed++ + case k8s.PodStatusKilledOOM: + workerCounts.KilledOOM++ + case k8s.PodStatusSucceeded: + workerCounts.Succeeded++ + case k8s.PodStatusUnknown: + workerCounts.Unknown++ + } } + + batchJob.Status.WorkerCounts = &workerCounts } if err := r.Status().Update(ctx, batchJob); err != nil { @@ -526,27 +569,6 @@ func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.B return nil } -func (r *BatchJobReconciler) checkWorkersOOM(ctx context.Context, batchJob *batch.BatchJob) (bool, error) { - workerJobPods := kcore.PodList{} - if err := r.List(ctx, &workerJobPods, - client.InNamespace(consts.DefaultNamespace), - client.MatchingLabels{ - "jobID": batchJob.Name, - "apiName": batchJob.Spec.APIName, - "apiID": batchJob.Spec.APIID, - }, - ); err != nil { - return false, err - } - - for i := range workerJobPods.Items { - if k8s.WasPodOOMKilled(&workerJobPods.Items[i]) { - return true, nil - } - } - return false, nil -} - func (r *BatchJobReconciler) deleteSQSQueue(batchJob batch.BatchJob) error { queueURL := r.getQueueURL(batchJob) input := sqs.DeleteQueueInput{QueueUrl: aws.String(queueURL)} diff --git a/pkg/lib/k8s/pod.go b/pkg/lib/k8s/pod.go index e841a7b8a8..2646783e88 100644 --- a/pkg/lib/k8s/pod.go +++ b/pkg/lib/k8s/pod.go @@ -23,6 +23,7 @@ import ( "time" "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/lib/sets/strset" kcore "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" @@ -45,37 +46,50 @@ const ( ReasonCompleted = "Completed" ) +type PodSpec struct { + Name string + K8sPodSpec kcore.PodSpec + Labels map[string]string + Annotations map[string]string +} + type PodStatus string const ( - PodStatusUnknown PodStatus = "Unknown" PodStatusPending PodStatus = "Pending" - PodStatusInitializing PodStatus = "Initializing" - 
PodStatusRunning PodStatus = "Running" - PodStatusErrImagePull PodStatus = "Image pull error" + PodStatusCreating PodStatus = "Creating" + PodStatusNotReady PodStatus = "NotReady" + PodStatusReady PodStatus = "Ready" + PodStatusErrImagePull PodStatus = "ErrImagePull" PodStatusTerminating PodStatus = "Terminating" - PodStatusSucceeded PodStatus = "Succeeded" PodStatusFailed PodStatus = "Failed" PodStatusKilled PodStatus = "Killed" - PodStatusKilledOOM PodStatus = "Out of Memory" + PodStatusKilledOOM PodStatus = "KilledOOM" + PodStatusStalled PodStatus = "Stalled" + + PodStatusSucceeded PodStatus = "Succeeded" + + PodStatusUnknown PodStatus = "Unknown" ) -var _killStatuses = map[int32]bool{ - 137: true, // SIGKILL - 143: true, // SIGTERM - 130: true, // SIGINT - 129: true, // SIGHUP -} +var ( + _killStatuses = map[int32]bool{ + 137: true, // SIGKILL + 143: true, // SIGTERM + 130: true, // SIGINT + 129: true, // SIGHUP + } -// https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/images/types.go#L27 -var _imagePullErrorStrings = strset.New("ErrImagePull", "ImagePullBackOff", "RegistryUnavailable") + _evictedMemoryMessageRegex = regexp.MustCompile(`(?i)low\W+on\W+resource\W+memory`) -type PodSpec struct { - Name string - K8sPodSpec kcore.PodSpec - Labels map[string]string - Annotations map[string]string -} + // https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/images/types.go#L27 + _imagePullErrorStrings = strset.New("ErrImagePull", "ImagePullBackOff", "RegistryUnavailable") + + // https://github.com/kubernetes/kubernetes/blob/9f47110aa29094ed2878cf1d85874cb59214664a/staging/src/k8s.io/api/core/v1/types.go#L76-L77 + _creatingReasons = strset.New("ContainerCreating", "PodInitializing") + + _waitForCreatingPodTimeout = time.Minute * 15 +) func Pod(spec *PodSpec) *kcore.Pod { pod := &kcore.Pod{ @@ -90,6 +104,26 @@ func Pod(spec *PodSpec) *kcore.Pod { return pod } +func GetPodConditionOf(pod *kcore.Pod, podType kcore.PodConditionType) *bool { + if 
pod == nil { + return nil + } + + var condition *bool + for _, podCondition := range pod.Status.Conditions { + if podCondition.Type == podType { + if podCondition.Status == kcore.ConditionTrue { + condition = pointer.Bool(true) + } + if podCondition.Status == kcore.ConditionFalse { + condition = pointer.Bool(false) + } + break + } + } + return condition +} + func (c *Client) CreatePod(pod *kcore.Pod) (*kcore.Pod, error) { pod.TypeMeta = _podTypeMeta pod, err := c.podClient.Create(context.Background(), pod, kmeta.CreateOptions{}) @@ -120,12 +154,28 @@ func (c *Client) ApplyPod(pod *kcore.Pod) (*kcore.Pod, error) { } func IsPodReady(pod *kcore.Pod) bool { - if GetPodStatus(pod) != PodStatusRunning { + if GetPodStatus(pod) != PodStatusReady { + return false + } + + // TODO use the GetPodConditionOf func here + for _, condition := range pod.Status.Conditions { + if condition.Type == kcore.PodReady && condition.Status == kcore.ConditionTrue { + return true + } + } + + return false +} + +func IsPodStalled(pod *kcore.Pod) bool { + if GetPodStatus(pod) != PodStatusPending { return false } + // TODO use the GetPodConditionOf func here for _, condition := range pod.Status.Conditions { - if condition.Type == "Ready" && condition.Status == kcore.ConditionTrue { + if condition.Type == kcore.PodScheduled && condition.Status == kcore.ConditionFalse && !condition.LastTransitionTime.Time.IsZero() && time.Since(condition.LastTransitionTime.Time) >= _waitForCreatingPodTimeout { return true } } @@ -137,7 +187,7 @@ func GetPodReadyTime(pod *kcore.Pod) *time.Time { for i := range pod.Status.Conditions { condition := pod.Status.Conditions[i] - if condition.Type == "Ready" && condition.Status == kcore.ConditionTrue { + if condition.Type == kcore.PodReady && condition.Status == kcore.ConditionTrue { if condition.LastTransitionTime.Time.IsZero() { return nil } @@ -148,8 +198,6 @@ func GetPodReadyTime(pod *kcore.Pod) *time.Time { return nil } -var _evictedMemoryMessageRegex = 
regexp.MustCompile(`(?i)low\W+on\W+resource\W+memory`) - func WasPodOOMKilled(pod *kcore.Pod) bool { if pod.Status.Reason == ReasonEvicted && _evictedMemoryMessageRegex.MatchString(pod.Status.Message) { return true @@ -176,15 +224,11 @@ func GetPodStatus(pod *kcore.Pod) PodStatus { switch pod.Status.Phase { case kcore.PodPending: - initPodStatus := PodStatusFromContainerStatuses(pod.Status.InitContainerStatuses) - if initPodStatus == PodStatusRunning { - return PodStatusInitializing - } - allPodStatus := PodStatusFromContainerStatuses(append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...)) - if allPodStatus == PodStatusErrImagePull { - return PodStatusErrImagePull + podCondition := GetPodConditionOf(pod, kcore.PodScheduled) + if podCondition != nil && !*podCondition { + return PodStatusStalled } - return PodStatusPending + return PodStatusFromContainerStatuses(append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...)) case kcore.PodSucceeded: return PodStatusSucceeded case kcore.PodFailed: @@ -215,7 +259,17 @@ func GetPodStatus(pod *kcore.Pod) PodStatus { return PodStatusTerminating } - return PodStatusFromContainerStatuses(pod.Status.ContainerStatuses) + podCondition := GetPodConditionOf(pod, kcore.PodReady) + if podCondition != nil && *podCondition { + return PodStatusReady + } + + status := PodStatusFromContainerStatuses(pod.Status.ContainerStatuses) + if status == PodStatusReady || status == PodStatusNotReady { + return PodStatusNotReady + } + + return status default: return PodStatusUnknown } @@ -224,7 +278,9 @@ func GetPodStatus(pod *kcore.Pod) PodStatus { func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) PodStatus { numContainers := len(containerStatuses) numWaiting := 0 - numRunning := 0 + numCreating := 0 + numNotReady := 0 + numReady := 0 numSucceeded := 0 numFailed := 0 numKilled := 0 @@ -235,9 +291,9 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P } for 
_, containerStatus := range containerStatuses { if containerStatus.State.Running != nil && containerStatus.Ready { - numRunning++ - } else if containerStatus.State.Running != nil && containerStatus.RestartCount == 0 { - numRunning++ + numReady++ + } else if containerStatus.State.Running != nil && !containerStatus.Ready { + numNotReady++ } else if containerStatus.State.Terminated != nil { exitCode := containerStatus.State.Terminated.ExitCode reason := containerStatus.State.Terminated.Reason @@ -264,6 +320,8 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P } } else if containerStatus.State.Waiting != nil && _imagePullErrorStrings.Has(containerStatus.State.Waiting.Reason) { return PodStatusErrImagePull + } else if containerStatus.State.Waiting != nil && _creatingReasons.Has(containerStatus.State.Waiting.Reason) { + numCreating++ } else { // either containerStatus.State.Waiting != nil or all containerStatus.States are nil (which implies waiting) numWaiting++ @@ -279,8 +337,12 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P return PodStatusPending } else if numSucceeded == numContainers { return PodStatusSucceeded + } else if numCreating > 0 { + return PodStatusCreating + } else if numNotReady > 0 { + return PodStatusNotReady } else { - return PodStatusRunning + return PodStatusReady } } diff --git a/pkg/operator/endpoints/describe.go b/pkg/operator/endpoints/describe.go new file mode 100644 index 0000000000..b574d5eefc --- /dev/null +++ b/pkg/operator/endpoints/describe.go @@ -0,0 +1,36 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package endpoints + +import ( + "net/http" + + "github.com/cortexlabs/cortex/pkg/operator/resources" + "github.com/gorilla/mux" +) + +func DescribeAPI(w http.ResponseWriter, r *http.Request) { + apiName := mux.Vars(r)["apiName"] + + response, err := resources.DescribeAPI(apiName) + if err != nil { + respondError(w, r, err) + return + } + + respondJSON(w, r, response) +} diff --git a/pkg/operator/operator/k8s.go b/pkg/operator/operator/k8s.go index b85cb81b9e..43e36168c9 100644 --- a/pkg/operator/operator/k8s.go +++ b/pkg/operator/operator/k8s.go @@ -22,6 +22,7 @@ import ( "github.com/cortexlabs/cortex/pkg/config" "github.com/cortexlabs/cortex/pkg/lib/urls" "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/userconfig" ) // APILoadBalancerURL returns the http endpoint of the ingress load balancer for deployed APIs @@ -64,8 +65,12 @@ func APIEndpoint(api *spec.API) (string, error) { return urls.Join(baseAPIEndpoint, *api.Networking.Endpoint), nil } -func APIEndpointFromPath(apiNetworkingPath string) (string, error) { - var err error +func APIEndpointFromResource(deployedResource *DeployedResource) (string, error) { + apiEndpoint, err := userconfig.EndpointFromAnnotation(deployedResource.VirtualService) + if err != nil { + return "", err + } + baseAPIEndpoint := "" baseAPIEndpoint, err = APILoadBalancerURL() @@ -74,5 +79,5 @@ func APIEndpointFromPath(apiNetworkingPath string) (string, error) { } baseAPIEndpoint = strings.Replace(baseAPIEndpoint, "https://", "http://", 1) - return urls.Join(baseAPIEndpoint, apiNetworkingPath), nil + 
return urls.Join(baseAPIEndpoint, apiEndpoint), nil } diff --git a/pkg/operator/resources/asyncapi/api.go b/pkg/operator/resources/asyncapi/api.go index 26553ecab8..9766811445 100644 --- a/pkg/operator/resources/asyncapi/api.go +++ b/pkg/operator/resources/asyncapi/api.go @@ -59,6 +59,12 @@ type resources struct { gatewayVirtualService *istioclientnetworking.VirtualService } +// TODO remove this +type asyncDeployments struct { + APIDeployment *kapps.Deployment + GatewayDeployment *kapps.Deployment +} + func getGatewayK8sName(apiName string) string { return "gateway-" + apiName } @@ -309,36 +315,87 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return nil, errors.ErrorUnexpected("unable to find gateway deployment", deployedResource.Name) } - deployment, err := config.K8s.GetDeployment(workloads.K8sName(deployedResource.Name)) + apiStatus := status.StatusFromDeployment(apiDeployment) + apiMetadata, err := spec.MetadataFromDeployment(apiDeployment) + if err != nil { + return nil, errors.ErrorUnexpected("unable to obtain metadata", deployedResource.Name) + } + + api, err := operator.DownloadAPISpec(apiMetadata.Name, apiMetadata.APIID) + if err != nil { + return nil, err + } + + apiEndpoint, err := operator.APIEndpoint(api) + if err != nil { + return nil, err + } + + dashboardURL := pointer.String(getDashboardURL(api.Name)) + + return []schema.APIResponse{ + { + Spec: api, + Metadata: apiMetadata, + Status: apiStatus, + Endpoint: &apiEndpoint, + DashboardURL: dashboardURL, + }, + }, nil +} + +func DescribeAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { + var apiDeployment *kapps.Deployment + var gatewayDeployment *kapps.Deployment + + err := parallel.RunFirstErr( + func() error { + var err error + apiDeployment, err = config.K8s.GetDeployment(workloads.K8sName(deployedResource.Name)) + return err + }, + func() error { + var err error + gatewayDeployment, err = 
config.K8s.GetDeployment(getGatewayK8sName(deployedResource.Name)) + return err + }, + ) if err != nil { return nil, err } - if deployment == nil { - return nil, errors.ErrorUnexpected("unable to find deployment", deployedResource.Name) + if apiDeployment == nil { + return nil, errors.ErrorUnexpected("unable to find api deployment", deployedResource.Name) } - apiStatus := status.StatusFromDeployment(deployment) - apiMetadata, err := spec.MetadataFromDeployment(deployment) + if gatewayDeployment == nil { + return nil, errors.ErrorUnexpected("unable to find gateway deployment", deployedResource.Name) + } + + apiStatus := status.StatusFromDeployment(apiDeployment) + apiMetadata, err := spec.MetadataFromDeployment(apiDeployment) if err != nil { return nil, errors.ErrorUnexpected("unable to obtain metadata", deployedResource.Name) } - api, err := operator.DownloadAPISpec(apiMetadata.Name, apiMetadata.APIID) + apiPods, err := config.K8s.ListPodsByLabels(map[string]string{ + "apiName": apiDeployment.Labels["apiName"], + "cortex.dev/async": "api", + }) if err != nil { return nil, err } + apiStatus.ReplicaCounts = GetReplicaCounts(apiDeployment, apiPods) - apiEndpoint, err := operator.APIEndpoint(api) + apiEndpoint, err := operator.APIEndpointFromResource(deployedResource) if err != nil { return nil, err } - dashboardURL := pointer.String(getDashboardURL(api.Name)) + dashboardURL := pointer.String(getDashboardURL(deployedResource.Name)) return []schema.APIResponse{ { - Spec: api, Metadata: apiMetadata, Status: apiStatus, Endpoint: &apiEndpoint, @@ -584,6 +641,58 @@ func deleteK8sResources(apiName string) error { return err } +// let's do CRDs instead, to avoid this +func groupDeploymentsByAPI(deployments []kapps.Deployment) map[string]*asyncDeployments { + deploymentsByAPI := map[string]*asyncDeployments{} + for i := range deployments { + deployment := deployments[i] + apiName := deployment.Labels["apiName"] + asyncType := deployment.Labels["cortex.dev/async"] + 
apiResources, exists := deploymentsByAPI[apiName] + if exists { + if asyncType == "api" { + apiResources.APIDeployment = &deployment + } else { + apiResources.GatewayDeployment = &deployment + } + } else { + if asyncType == "api" { + deploymentsByAPI[apiName] = &asyncDeployments{APIDeployment: &deployment} + } else { + deploymentsByAPI[apiName] = &asyncDeployments{GatewayDeployment: &deployment} + } + } + } + return deploymentsByAPI +} + +// returns true if min_replicas are not ready and no updated replicas have errored +func isAPIUpdating(deployment *kapps.Deployment) (bool, error) { + pods, err := config.K8s.ListPodsByLabel("apiName", deployment.Labels["apiName"]) + if err != nil { + return false, err + } + + replicaCounts := GetReplicaCounts(deployment, pods) + + autoscalingSpec, err := userconfig.AutoscalingFromAnnotations(deployment) + if err != nil { + return false, err + } + + if replicaCounts.Ready < autoscalingSpec.MinReplicas && replicaCounts.TotalFailed() == 0 { + return true, nil + } + + return false, nil +} + +func isPodSpecLatest(deployment *kapps.Deployment, pod *kcore.Pod) bool { + // Note: the gateway deployment/pods don't have "podID" or "deploymentID" labels, which is ok since it is always up-to-date + return deployment.Spec.Template.Labels["podID"] == pod.Labels["podID"] && + deployment.Spec.Template.Labels["deploymentID"] == pod.Labels["deploymentID"] +} + func getDashboardURL(apiName string) string { loadBalancerURL, err := operator.LoadBalancerURL() if err != nil { diff --git a/pkg/operator/resources/asyncapi/status.go b/pkg/operator/resources/asyncapi/status.go index 48189a89cb..0268f26048 100644 --- a/pkg/operator/resources/asyncapi/status.go +++ b/pkg/operator/resources/asyncapi/status.go @@ -17,98 +17,13 @@ limitations under the License. 
package asyncapi import ( - "time" - - "github.com/cortexlabs/cortex/pkg/config" - "github.com/cortexlabs/cortex/pkg/consts" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/types/status" - "github.com/cortexlabs/cortex/pkg/types/userconfig" kapps "k8s.io/api/apps/v1" kcore "k8s.io/api/core/v1" ) -type asyncDeployments struct { - APIDeployment *kapps.Deployment - GatewayDeployment *kapps.Deployment -} - -// let's do CRDs instead, to avoid this -func groupDeploymentsByAPI(deployments []kapps.Deployment) map[string]*asyncDeployments { - deploymentsByAPI := map[string]*asyncDeployments{} - for i := range deployments { - deployment := deployments[i] - apiName := deployment.Labels["apiName"] - asyncType := deployment.Labels["cortex.dev/async"] - apiResources, exists := deploymentsByAPI[apiName] - if exists { - if asyncType == "api" { - apiResources.APIDeployment = &deployment - } else { - apiResources.GatewayDeployment = &deployment - } - } else { - if asyncType == "api" { - deploymentsByAPI[apiName] = &asyncDeployments{APIDeployment: &deployment} - } else { - deploymentsByAPI[apiName] = &asyncDeployments{GatewayDeployment: &deployment} - } - } - } - return deploymentsByAPI -} - -func getStatusCode(apiCounts status.ReplicaCounts, gatewayCounts status.ReplicaCounts, apiMinReplicas int32) status.Code { - if apiCounts.Updated.Ready >= apiCounts.Requested && gatewayCounts.Updated.Ready >= 1 { - return status.Live - } - - if apiCounts.Updated.ErrImagePull > 0 || gatewayCounts.Updated.ErrImagePull > 0 { - return status.ErrorImagePull - } - - if apiCounts.Updated.Failed > 0 || apiCounts.Updated.Killed > 0 || - gatewayCounts.Updated.Failed > 0 || gatewayCounts.Updated.Killed > 0 { - return status.Error - } - - if apiCounts.Updated.KilledOOM > 0 || gatewayCounts.Updated.KilledOOM > 0 { - return status.OOM - } - - if apiCounts.Updated.Stalled > 0 || gatewayCounts.Updated.Stalled > 0 { - return status.Stalled - } - - if apiCounts.Updated.Ready >= 
apiMinReplicas && gatewayCounts.Updated.Ready >= 1 { - return status.Live - } - - return status.Updating -} - -// returns true if min_replicas are not ready and no updated replicas have errored -func isAPIUpdating(deployment *kapps.Deployment) (bool, error) { - pods, err := config.K8s.ListPodsByLabel("apiName", deployment.Labels["apiName"]) - if err != nil { - return false, err - } - - replicaCounts := getReplicaCounts(deployment, pods) - - autoscalingSpec, err := userconfig.AutoscalingFromAnnotations(deployment) - if err != nil { - return false, err - } - - if replicaCounts.Updated.Ready < autoscalingSpec.MinReplicas && replicaCounts.Updated.TotalFailed() == 0 { - return true, nil - } - - return false, nil -} - -func getReplicaCounts(deployment *kapps.Deployment, pods []kcore.Pod) status.ReplicaCounts { +func GetReplicaCounts(deployment *kapps.Deployment, pods []kcore.Pod) *status.ReplicaCounts { counts := status.ReplicaCounts{} counts.Requested = *deployment.Spec.Replicas @@ -121,50 +36,44 @@ func getReplicaCounts(deployment *kapps.Deployment, pods []kcore.Pod) status.Rep addPodToReplicaCounts(&pod, deployment, &counts) } - return counts + return &counts } func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts *status.ReplicaCounts) { - var subCounts *status.SubReplicaCounts + latest := false if isPodSpecLatest(deployment, pod) { - subCounts = &counts.Updated - } else { - subCounts = &counts.Stale + latest = true } - if k8s.IsPodReady(pod) { - subCounts.Ready++ + isPodReady := k8s.IsPodReady(pod) + if latest && isPodReady { + counts.Ready++ + return + } else if !latest && isPodReady { + counts.ReadyOutOfDate++ return } switch k8s.GetPodStatus(pod) { case k8s.PodStatusPending: - if time.Since(pod.CreationTimestamp.Time) > consts.WaitForInitializingReplicasTimeout { - subCounts.Stalled++ - } else { - subCounts.Pending++ - } - case k8s.PodStatusInitializing: - subCounts.Initializing++ - case k8s.PodStatusRunning: - subCounts.Initializing++ + 
counts.Pending++ + case k8s.PodStatusStalled: + counts.Stalled++ + case k8s.PodStatusCreating: + counts.Creating++ + case k8s.PodStatusReady: + counts.Creating++ case k8s.PodStatusErrImagePull: - subCounts.ErrImagePull++ + counts.ErrImagePull++ case k8s.PodStatusTerminating: - subCounts.Terminating++ + counts.Terminating++ case k8s.PodStatusFailed: - subCounts.Failed++ + counts.Failed++ case k8s.PodStatusKilled: - subCounts.Killed++ + counts.Killed++ case k8s.PodStatusKilledOOM: - subCounts.KilledOOM++ + counts.KilledOOM++ default: - subCounts.Unknown++ + counts.Unknown++ } } - -func isPodSpecLatest(deployment *kapps.Deployment, pod *kcore.Pod) bool { - // Note: the gateway deployment/pods don't have "podID" or "deploymentID" labels, which is ok since it is always up-to-date - return deployment.Spec.Template.Labels["podID"] == pod.Labels["podID"] && - deployment.Spec.Template.Labels["deploymentID"] == pod.Labels["deploymentID"] -} diff --git a/pkg/operator/resources/job/worker_stats.go b/pkg/operator/resources/job/worker_stats.go index 07628995e4..797d65980e 100644 --- a/pkg/operator/resources/job/worker_stats.go +++ b/pkg/operator/resources/job/worker_stats.go @@ -17,9 +17,6 @@ limitations under the License. 
package job import ( - "time" - - "github.com/cortexlabs/cortex/pkg/consts" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/types/status" kbatch "k8s.io/api/batch/v1" @@ -43,34 +40,32 @@ func GetWorkerCountsForJob(k8sJob kbatch.Job, pods []kcore.Pod) status.WorkerCou func addPodToWorkerCounts(pod *kcore.Pod, workerCounts *status.WorkerCounts) { if k8s.IsPodReady(pod) { - workerCounts.Running++ + workerCounts.Ready++ return } switch k8s.GetPodStatus(pod) { case k8s.PodStatusPending: - if time.Since(pod.CreationTimestamp.Time) > consts.WaitForInitializingReplicasTimeout { - workerCounts.Stalled++ - } else { - workerCounts.Pending++ - } - case k8s.PodStatusInitializing: - workerCounts.Initializing++ - case k8s.PodStatusRunning: - workerCounts.Initializing++ + workerCounts.Pending++ + case k8s.PodStatusStalled: + workerCounts.Stalled++ + case k8s.PodStatusCreating: + workerCounts.Creating++ + case k8s.PodStatusNotReady: + workerCounts.NotReady++ case k8s.PodStatusErrImagePull: - workerCounts.Failed++ + workerCounts.ErrImagePull++ case k8s.PodStatusTerminating: - workerCounts.Failed++ + workerCounts.Terminating++ case k8s.PodStatusFailed: workerCounts.Failed++ case k8s.PodStatusKilled: - workerCounts.Failed++ + workerCounts.Killed++ case k8s.PodStatusKilledOOM: - workerCounts.Failed++ + workerCounts.KilledOOM++ case k8s.PodStatusSucceeded: workerCounts.Succeeded++ - default: + case k8s.PodStatusUnknown: workerCounts.Unknown++ } } diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index d787f5c99d..b7df138a30 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -242,6 +242,45 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp }, nil } +func DescribeAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { + deployment, err := 
config.K8s.GetDeployment(workloads.K8sName(deployedResource.Name)) + if err != nil { + return nil, err + } + + if deployment == nil { + return nil, errors.ErrorUnexpected("unable to find deployment", deployedResource.Name) + } + + apiStatus := status.StatusFromDeployment(deployment) + apiMetadata, err := spec.MetadataFromDeployment(deployment) + if err != nil { + return nil, errors.ErrorUnexpected("unable to obtain metadata", deployedResource.Name) + } + + pods, err := config.K8s.ListPodsByLabel("apiName", deployment.Labels["apiName"]) + if err != nil { + return nil, err + } + apiStatus.ReplicaCounts = GetReplicaCounts(deployment, pods) + + apiEndpoint, err := operator.APIEndpointFromResource(deployedResource) + if err != nil { + return nil, err + } + + dashboardURL := pointer.String(getDashboardURL(deployedResource.Name)) + + return []schema.APIResponse{ + { + Metadata: apiMetadata, + Status: apiStatus, + Endpoint: &apiEndpoint, + DashboardURL: dashboardURL, + }, + }, nil +} + func getK8sResources(apiName string) (*kapps.Deployment, *kcore.Service, *istioclientnetworking.VirtualService, error) { var deployment *kapps.Deployment var service *kcore.Service @@ -360,14 +399,14 @@ func isAPIUpdating(deployment *kapps.Deployment) (bool, error) { return false, err } - replicaCounts := getReplicaCounts(deployment, pods) + replicaCounts := GetReplicaCounts(deployment, pods) autoscalingSpec, err := userconfig.AutoscalingFromAnnotations(deployment) if err != nil { return false, err } - if replicaCounts.Updated.Ready < autoscalingSpec.MinReplicas && replicaCounts.Updated.TotalFailed() == 0 { + if replicaCounts.Ready < autoscalingSpec.MinReplicas && replicaCounts.TotalFailed() == 0 { return true, nil } diff --git a/pkg/operator/resources/realtimeapi/status.go b/pkg/operator/resources/realtimeapi/status.go index 9952ccd9ca..d021cc19f9 100644 --- a/pkg/operator/resources/realtimeapi/status.go +++ b/pkg/operator/resources/realtimeapi/status.go @@ -17,16 +17,13 @@ limitations 
under the License. package realtimeapi import ( - "time" - - "github.com/cortexlabs/cortex/pkg/consts" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/types/status" kapps "k8s.io/api/apps/v1" kcore "k8s.io/api/core/v1" ) -func getReplicaCounts(deployment *kapps.Deployment, pods []kcore.Pod) status.ReplicaCounts { +func GetReplicaCounts(deployment *kapps.Deployment, pods []kcore.Pod) *status.ReplicaCounts { counts := status.ReplicaCounts{} counts.Requested = *deployment.Spec.Replicas @@ -38,72 +35,44 @@ func getReplicaCounts(deployment *kapps.Deployment, pods []kcore.Pod) status.Rep addPodToReplicaCounts(&pods[i], deployment, &counts) } - return counts + return &counts } func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts *status.ReplicaCounts) { - var subCounts *status.SubReplicaCounts + latest := false if isPodSpecLatest(deployment, pod) { - subCounts = &counts.Updated - } else { - subCounts = &counts.Stale + latest = true } - if k8s.IsPodReady(pod) { - subCounts.Ready++ + isPodReady := k8s.IsPodReady(pod) + if latest && isPodReady { + counts.Ready++ + return + } else if !latest && isPodReady { + counts.ReadyOutOfDate++ return } switch k8s.GetPodStatus(pod) { case k8s.PodStatusPending: - if time.Since(pod.CreationTimestamp.Time) > consts.WaitForInitializingReplicasTimeout { - subCounts.Stalled++ - } else { - subCounts.Pending++ - } - case k8s.PodStatusInitializing: - subCounts.Initializing++ - case k8s.PodStatusRunning: - subCounts.Initializing++ + counts.Pending++ + case k8s.PodStatusStalled: + counts.Stalled++ + case k8s.PodStatusCreating: + counts.Creating++ + case k8s.PodStatusReady: + counts.Creating++ case k8s.PodStatusErrImagePull: - subCounts.ErrImagePull++ + counts.ErrImagePull++ case k8s.PodStatusTerminating: - subCounts.Terminating++ + counts.Terminating++ case k8s.PodStatusFailed: - subCounts.Failed++ + counts.Failed++ case k8s.PodStatusKilled: - subCounts.Killed++ + counts.Killed++ case 
k8s.PodStatusKilledOOM: - subCounts.KilledOOM++ + counts.KilledOOM++ default: - subCounts.Unknown++ + counts.Unknown++ } } - -func getStatusCode(counts *status.ReplicaCounts, minReplicas int32) status.Code { - if counts.Updated.Ready >= counts.Requested { - return status.Live - } - - if counts.Updated.ErrImagePull > 0 { - return status.ErrorImagePull - } - - if counts.Updated.Failed > 0 || counts.Updated.Killed > 0 { - return status.Error - } - - if counts.Updated.KilledOOM > 0 { - return status.OOM - } - - if counts.Updated.Stalled > 0 { - return status.Stalled - } - - if counts.Updated.Ready >= minReplicas { - return status.Live - } - - return status.Updating -} diff --git a/pkg/operator/resources/resources.go b/pkg/operator/resources/resources.go index 5350f99e47..445571ad25 100644 --- a/pkg/operator/resources/resources.go +++ b/pkg/operator/resources/resources.go @@ -308,9 +308,6 @@ func GetAPIs() ([]schema.APIResponse, error) { } } - fmt.Println("realtimeAPIDeployments", len(realtimeAPIDeployments)) - fmt.Println("asyncAPIDeployments", len(asyncAPIDeployments)) - var batchAPIVirtualServices []istioclientnetworking.VirtualService var taskAPIVirtualServices []istioclientnetworking.VirtualService var trafficSplitterVirtualServices []istioclientnetworking.VirtualService @@ -486,3 +483,33 @@ func checkIfUsedByTrafficSplitter(apiName string) error { } return nil } + +func DescribeAPI(apiName string) ([]schema.APIResponse, error) { + deployedResource, err := GetDeployedResourceByName(apiName) + if err != nil { + return nil, err + } + + var apiResponse []schema.APIResponse + + switch deployedResource.Kind { + case userconfig.RealtimeAPIKind: + apiResponse, err = realtimeapi.DescribeAPIByName(deployedResource) + if err != nil { + return nil, err + } + case userconfig.AsyncAPIKind: + apiResponse, err = asyncapi.DescribeAPIByName(deployedResource) + if err != nil { + return nil, err + } + default: + return nil, ErrorOperationIsOnlySupportedForKind( + *deployedResource, + 
userconfig.RealtimeAPIKind, + userconfig.AsyncAPIKind, + ) // unexpected + } + + return apiResponse, nil +} diff --git a/pkg/types/status/code.go b/pkg/types/status/code.go deleted file mode 100644 index 41a8a13d91..0000000000 --- a/pkg/types/status/code.go +++ /dev/null @@ -1,97 +0,0 @@ -/* -Copyright 2021 Cortex Labs, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package status - -type Code int - -const ( - Unknown Code = iota - Stalled - Error - ErrorImagePull - OOM - Live - Updating -) - -var _codes = []string{ - "status_unknown", - "status_stalled", - "status_error", - "status_error_image_pull", - "status_oom", - "status_live", - "status_updating", -} - -var _ = [1]int{}[int(Updating)-(len(_codes)-1)] // Ensure list length matches - -var _codeMessages = []string{ - "unknown", // Unknown - "compute unavailable", // Stalled - "error", // Error - "error (image pull)", // Live - "error (out of memory)", // OOM - "live", // Live - "updating", // Updating -} - -var _ = [1]int{}[int(Updating)-(len(_codeMessages)-1)] // Ensure list length matches - -func (code Code) String() string { - if int(code) < 0 || int(code) >= len(_codes) { - return _codes[Unknown] - } - return _codes[code] -} - -func (code Code) Message() string { - if int(code) < 0 || int(code) >= len(_codeMessages) { - return _codeMessages[Unknown] - } - return _codeMessages[code] -} - -// MarshalText satisfies TextMarshaler -func (code Code) MarshalText() ([]byte, error) { - return []byte(code.String()), nil -} - 
-// UnmarshalText satisfies TextUnmarshaler -func (code *Code) UnmarshalText(text []byte) error { - enum := string(text) - for i := 0; i < len(_codes); i++ { - if enum == _codes[i] { - *code = Code(i) - return nil - } - } - - *code = Unknown - return nil -} - -// UnmarshalBinary satisfies BinaryUnmarshaler -// Needed for msgpack -func (code *Code) UnmarshalBinary(data []byte) error { - return code.UnmarshalText(data) -} - -// MarshalBinary satisfies BinaryMarshaler -func (code Code) MarshalBinary() ([]byte, error) { - return []byte(code.String()), nil -} diff --git a/pkg/types/status/status.go b/pkg/types/status/status.go index 8a6a4dd160..e3faba9e9f 100644 --- a/pkg/types/status/status.go +++ b/pkg/types/status/status.go @@ -21,37 +21,66 @@ import ( ) type Status struct { - Ready int32 `json:"ready"` - Requested int32 `json:"requested"` - UpToDate int32 `json:"up_to_date"` + Ready int32 `json:"ready"` + Requested int32 `json:"requested"` + UpToDate int32 `json:"up_to_date"` + ReplicaCounts *ReplicaCounts `json:"replica_counts,omitempty"` } -type ReplicaCounts struct { - Updated SubReplicaCounts `json:"updated"` - Stale SubReplicaCounts `json:"stale"` - Requested int32 `json:"requested"` +type ReplicaCountType string + +const ( + ReplicaCountRequested ReplicaCountType = "Requested" + ReplicaCountPending ReplicaCountType = "Pending" + ReplicaCountCreating ReplicaCountType = "Creating" + ReplicaCountNotReady ReplicaCountType = "NotReady" + ReplicaCountReady ReplicaCountType = "Ready" + ReplicaCountReadyOutOfDate ReplicaCountType = "ReadyOutOfDate" + ReplicaCountErrImagePull ReplicaCountType = "ErrImagePull" + ReplicaCountTerminating ReplicaCountType = "Terminating" + ReplicaCountFailed ReplicaCountType = "Failed" + ReplicaCountKilled ReplicaCountType = "Killed" + ReplicaCountKilledOOM ReplicaCountType = "KilledOOM" + ReplicaCountStalled ReplicaCountType = "Stalled" + ReplicaCountUnknown ReplicaCountType = "Unknown" +) + +var ReplicaCountTypes []ReplicaCountType = 
[]ReplicaCountType{ + ReplicaCountRequested, ReplicaCountPending, ReplicaCountCreating, + ReplicaCountNotReady, ReplicaCountReady, ReplicaCountReadyOutOfDate, + ReplicaCountErrImagePull, ReplicaCountTerminating, ReplicaCountFailed, + ReplicaCountKilled, ReplicaCountKilledOOM, ReplicaCountStalled, + ReplicaCountUnknown, } -type SubReplicaCounts struct { - Pending int32 `json:"pending"` - Initializing int32 `json:"initializing"` - Ready int32 `json:"ready"` - ErrImagePull int32 `json:"err_image_pull"` - Terminating int32 `json:"terminating"` - Failed int32 `json:"failed"` - Killed int32 `json:"killed"` - KilledOOM int32 `json:"killed_oom"` - Stalled int32 `json:"stalled"` // pending for a long time - Unknown int32 `json:"unknown"` +type ReplicaCounts struct { + Requested int32 `json:"requested"` + Pending int32 `json:"pending"` + Creating int32 `json:"creating"` + NotReady int32 `json:"not_ready"` + Ready int32 `json:"ready"` + ReadyOutOfDate int32 `json:"ready_out_of_date"` + ErrImagePull int32 `json:"err_image_pull"` + Terminating int32 `json:"terminating"` + Failed int32 `json:"failed"` + Killed int32 `json:"killed"` + KilledOOM int32 `json:"killed_oom"` + Stalled int32 `json:"stalled"` // pending for a long time + Unknown int32 `json:"unknown"` } // Worker counts don't have as many failure variations because Jobs clean up dead pods, so counting different failure scenarios isn't interesting type WorkerCounts struct { Pending int32 `json:"pending,omitempty"` - Initializing int32 `json:"initializing,omitempty"` - Running int32 `json:"running,omitempty"` + Creating int32 `json:"creating,omitempty"` + NotReady int32 `json:"not_ready,omitempty"` + Ready int32 `json:"ready,omitempty"` Succeeded int32 `json:"succeeded,omitempty"` + ErrImagePull int32 `json:"err_image_pull,omitempty"` + Terminating int32 `json:"terminating,omitempty"` Failed int32 `json:"failed,omitempty"` + Killed int32 `json:"killed,omitempty"` + KilledOOM int32 `json:"killed_oom,omitempty"` Stalled 
int32 `json:"stalled,omitempty"` // pending for a long time Unknown int32 `json:"unknown,omitempty"` } @@ -64,6 +93,36 @@ func StatusFromDeployment(deployment *kapps.Deployment) *Status { } } -func (src *SubReplicaCounts) TotalFailed() int32 { - return src.Failed + src.ErrImagePull + src.Killed + src.KilledOOM + src.Stalled +func (counts *ReplicaCounts) GetCountBy(replicaType ReplicaCountType) int32 { + switch replicaType { + case ReplicaCountRequested: + return counts.Requested + case ReplicaCountPending: + return counts.Pending + case ReplicaCountCreating: + return counts.Creating + case ReplicaCountNotReady: + return counts.NotReady + case ReplicaCountReady: + return counts.Ready + case ReplicaCountReadyOutOfDate: + return counts.ReadyOutOfDate + case ReplicaCountErrImagePull: + return counts.ErrImagePull + case ReplicaCountTerminating: + return counts.Terminating + case ReplicaCountFailed: + return counts.Failed + case ReplicaCountKilled: + return counts.Killed + case ReplicaCountKilledOOM: + return counts.KilledOOM + case ReplicaCountStalled: + return counts.Stalled + } + return counts.Unknown +} + +func (counts *ReplicaCounts) TotalFailed() int32 { + return counts.ErrImagePull + counts.Failed + counts.Killed + counts.KilledOOM + counts.Unknown } diff --git a/pkg/types/userconfig/api.go b/pkg/types/userconfig/api.go index 1872187f8c..d1e9f65f7b 100644 --- a/pkg/types/userconfig/api.go +++ b/pkg/types/userconfig/api.go @@ -257,6 +257,14 @@ func TrafficSplitterTargetsFromAnnotations(k8sObj kmeta.Object) (int32, error) { return targets, nil } +func EndpointFromAnnotation(k8sObj kmeta.Object) (string, error) { + endpoint, err := k8s.GetAnnotation(k8sObj, EndpointAnnotationKey) + if err != nil { + return "", err + } + return endpoint, nil +} + func (api *API) UserStr() string { var sb strings.Builder sb.WriteString(fmt.Sprintf("%s: %s\n", NameKey, api.Name)) From fb1512205f6c5f4a07a6e18d8051c40956d0ae2f Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: 
Sat, 24 Jul 2021 01:32:50 +0300 Subject: [PATCH 28/40] Fixes --- cli/cluster/delete.go | 4 +- cli/cmd/describe.go | 2 +- cli/cmd/get.go | 2 +- cli/cmd/lib_batch_apis.go | 4 +- cli/cmd/lib_task_apis.go | 4 +- pkg/lib/k8s/pod.go | 32 +++++++++------- pkg/operator/resources/asyncapi/api.go | 37 ++----------------- pkg/operator/resources/asyncapi/status.go | 8 +++- pkg/operator/resources/job/batchapi/api.go | 6 +-- pkg/operator/resources/job/taskapi/api.go | 6 +-- pkg/operator/resources/realtimeapi/api.go | 6 +-- pkg/operator/resources/realtimeapi/status.go | 8 +++- pkg/operator/resources/trafficsplitter/api.go | 8 ++-- pkg/types/status/status.go | 2 +- 14 files changed, 57 insertions(+), 72 deletions(-) diff --git a/cli/cluster/delete.go b/cli/cluster/delete.go index 7b1d96d86d..47618b304b 100644 --- a/cli/cluster/delete.go +++ b/cli/cluster/delete.go @@ -22,6 +22,7 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/json" + "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/lib/prompt" s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/operator/schema" @@ -70,8 +71,7 @@ func getReadyRealtimeAPIReplicasOrNil(operatorConfig OperatorConfig, apiName str return nil } - totalReady := apiRes.Status.Ready - return &totalReady + return pointer.Int32(apiRes.Status.Ready) } func StopJob(operatorConfig OperatorConfig, kind userconfig.Kind, apiName string, jobID string) (schema.DeleteResponse, error) { diff --git a/cli/cmd/describe.go b/cli/cmd/describe.go index be23ef6532..767045c5a2 100644 --- a/cli/cmd/describe.go +++ b/cli/cmd/describe.go @@ -97,7 +97,7 @@ func describeAPI(env cliconfig.Environment, apiName string) (string, error) { } if len(apisRes) == 0 { - exit.Error(errors.ErrorUnexpected(fmt.Sprintf("unable to find API %s", apiName))) + exit.Error(errors.ErrorUnexpected(fmt.Sprintf("unable to find api %s", apiName))) } apiRes := apisRes[0] diff --git a/cli/cmd/get.go 
b/cli/cmd/get.go index e0083dfa99..0fac23d076 100644 --- a/cli/cmd/get.go +++ b/cli/cmd/get.go @@ -464,7 +464,7 @@ func getAPI(env cliconfig.Environment, apiName string) (string, error) { } if len(apisRes) == 0 { - exit.Error(errors.ErrorUnexpected(fmt.Sprintf("unable to find API %s", apiName))) + exit.Error(errors.ErrorUnexpected(fmt.Sprintf("unable to find api %s", apiName))) } apiRes := apisRes[0] diff --git a/cli/cmd/lib_batch_apis.go b/cli/cmd/lib_batch_apis.go index 5cebcdd2ba..17cdb1301c 100644 --- a/cli/cmd/lib_batch_apis.go +++ b/cli/cmd/lib_batch_apis.go @@ -222,8 +222,8 @@ func getBatchJob(env cliconfig.Environment, apiName string, jobID string) (strin t := table.Table{ Headers: []table.Header{ {Title: "Requested"}, - {Title: "Pending", Hidden: job.WorkerCounts.Pending == 0}, - {Title: "Creating", Hidden: job.WorkerCounts.Creating == 0}, + {Title: "Pending"}, + {Title: "Creating"}, {Title: "Ready"}, {Title: "NotReady"}, {Title: "ErrImagePull", Hidden: job.WorkerCounts.ErrImagePull == 0}, diff --git a/cli/cmd/lib_task_apis.go b/cli/cmd/lib_task_apis.go index 49541aad77..6c3d2b8383 100644 --- a/cli/cmd/lib_task_apis.go +++ b/cli/cmd/lib_task_apis.go @@ -182,8 +182,8 @@ func getTaskJob(env cliconfig.Environment, apiName string, jobID string) (string t := table.Table{ Headers: []table.Header{ {Title: "Requested"}, - {Title: "Pending", Hidden: job.WorkerCounts.Pending == 0}, - {Title: "Creating", Hidden: job.WorkerCounts.Creating == 0}, + {Title: "Pending"}, + {Title: "Creating"}, {Title: "Ready"}, {Title: "NotReady"}, {Title: "ErrImagePull", Hidden: job.WorkerCounts.ErrImagePull == 0}, diff --git a/pkg/lib/k8s/pod.go b/pkg/lib/k8s/pod.go index 2646783e88..0d3e0125bd 100644 --- a/pkg/lib/k8s/pod.go +++ b/pkg/lib/k8s/pod.go @@ -19,6 +19,7 @@ package k8s import ( "bytes" "context" + "fmt" "regexp" "time" @@ -104,24 +105,26 @@ func Pod(spec *PodSpec) *kcore.Pod { return pod } -func GetPodConditionOf(pod *kcore.Pod, podType kcore.PodConditionType) *bool { +func 
GetPodConditionOf(pod *kcore.Pod, podType kcore.PodConditionType) (*bool, *kcore.PodCondition) { if pod == nil { - return nil + return nil, nil } - var condition *bool - for _, podCondition := range pod.Status.Conditions { - if podCondition.Type == podType { - if podCondition.Status == kcore.ConditionTrue { - condition = pointer.Bool(true) + var conditionState *bool + var condition *kcore.PodCondition + for i := range pod.Status.Conditions { + if pod.Status.Conditions[i].Type == podType { + if pod.Status.Conditions[i].Status == kcore.ConditionTrue { + conditionState = pointer.Bool(true) } - if podCondition.Status == kcore.ConditionFalse { - condition = pointer.Bool(false) + if pod.Status.Conditions[i].Status == kcore.ConditionFalse { + conditionState = pointer.Bool(false) } + condition = &pod.Status.Conditions[i] break } } - return condition + return conditionState, condition } func (c *Client) CreatePod(pod *kcore.Pod) (*kcore.Pod, error) { @@ -176,6 +179,7 @@ func IsPodStalled(pod *kcore.Pod) bool { // TODO use the GetPodConditionOf func here for _, condition := range pod.Status.Conditions { if condition.Type == kcore.PodScheduled && condition.Status == kcore.ConditionFalse && !condition.LastTransitionTime.Time.IsZero() && time.Since(condition.LastTransitionTime.Time) >= _waitForCreatingPodTimeout { + fmt.Println(time.Since(condition.LastTransitionTime.Time), _waitForCreatingPodTimeout) return true } } @@ -224,8 +228,8 @@ func GetPodStatus(pod *kcore.Pod) PodStatus { switch pod.Status.Phase { case kcore.PodPending: - podCondition := GetPodConditionOf(pod, kcore.PodScheduled) - if podCondition != nil && !*podCondition { + podConditionState, podCondition := GetPodConditionOf(pod, kcore.PodScheduled) + if podConditionState != nil && !*podConditionState && !podCondition.LastTransitionTime.Time.IsZero() && time.Since(podCondition.LastTransitionTime.Time) >= _waitForCreatingPodTimeout { return PodStatusStalled } return 
PodStatusFromContainerStatuses(append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...)) @@ -259,8 +263,8 @@ func GetPodStatus(pod *kcore.Pod) PodStatus { return PodStatusTerminating } - podCondition := GetPodConditionOf(pod, kcore.PodReady) - if podCondition != nil && *podCondition { + podConditionState, _ := GetPodConditionOf(pod, kcore.PodReady) + if podConditionState != nil && *podConditionState { return PodStatusReady } diff --git a/pkg/operator/resources/asyncapi/api.go b/pkg/operator/resources/asyncapi/api.go index 9766811445..0c6ba6b190 100644 --- a/pkg/operator/resources/asyncapi/api.go +++ b/pkg/operator/resources/asyncapi/api.go @@ -59,12 +59,6 @@ type resources struct { gatewayVirtualService *istioclientnetworking.VirtualService } -// TODO remove this -type asyncDeployments struct { - APIDeployment *kapps.Deployment - GatewayDeployment *kapps.Deployment -} - func getGatewayK8sName(apiName string) string { return "gateway-" + apiName } @@ -274,7 +268,7 @@ func GetAllAPIs(deployments []kapps.Deployment) ([]schema.APIResponse, error) { return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } mappedAsyncAPIs[apiName] = schema.APIResponse{ - Status: status.StatusFromDeployment(&deployments[i]), + Status: status.FromDeployment(&deployments[i]), Metadata: metadata, } } @@ -315,7 +309,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return nil, errors.ErrorUnexpected("unable to find gateway deployment", deployedResource.Name) } - apiStatus := status.StatusFromDeployment(apiDeployment) + apiStatus := status.FromDeployment(apiDeployment) apiMetadata, err := spec.MetadataFromDeployment(apiDeployment) if err != nil { return nil, errors.ErrorUnexpected("unable to obtain metadata", deployedResource.Name) @@ -372,7 +366,7 @@ func DescribeAPIByName(deployedResource *operator.DeployedResource) ([]schema.AP return nil, errors.ErrorUnexpected("unable to find gateway deployment", deployedResource.Name) } - 
apiStatus := status.StatusFromDeployment(apiDeployment) + apiStatus := status.FromDeployment(apiDeployment) apiMetadata, err := spec.MetadataFromDeployment(apiDeployment) if err != nil { return nil, errors.ErrorUnexpected("unable to obtain metadata", deployedResource.Name) @@ -641,31 +635,6 @@ func deleteK8sResources(apiName string) error { return err } -// let's do CRDs instead, to avoid this -func groupDeploymentsByAPI(deployments []kapps.Deployment) map[string]*asyncDeployments { - deploymentsByAPI := map[string]*asyncDeployments{} - for i := range deployments { - deployment := deployments[i] - apiName := deployment.Labels["apiName"] - asyncType := deployment.Labels["cortex.dev/async"] - apiResources, exists := deploymentsByAPI[apiName] - if exists { - if asyncType == "api" { - apiResources.APIDeployment = &deployment - } else { - apiResources.GatewayDeployment = &deployment - } - } else { - if asyncType == "api" { - deploymentsByAPI[apiName] = &asyncDeployments{APIDeployment: &deployment} - } else { - deploymentsByAPI[apiName] = &asyncDeployments{GatewayDeployment: &deployment} - } - } - } - return deploymentsByAPI -} - // returns true if min_replicas are not ready and no updated replicas have errored func isAPIUpdating(deployment *kapps.Deployment) (bool, error) { pods, err := config.K8s.ListPodsByLabel("apiName", deployment.Labels["apiName"]) diff --git a/pkg/operator/resources/asyncapi/status.go b/pkg/operator/resources/asyncapi/status.go index 0268f26048..3a0e4b5c1a 100644 --- a/pkg/operator/resources/asyncapi/status.go +++ b/pkg/operator/resources/asyncapi/status.go @@ -54,6 +54,10 @@ func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts return } + if !latest { + return + } + switch k8s.GetPodStatus(pod) { case k8s.PodStatusPending: counts.Pending++ @@ -63,6 +67,8 @@ func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts counts.Creating++ case k8s.PodStatusReady: counts.Creating++ + case 
k8s.PodStatusNotReady: + counts.NotReady++ case k8s.PodStatusErrImagePull: counts.ErrImagePull++ case k8s.PodStatusTerminating: @@ -73,7 +79,7 @@ func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts counts.Killed++ case k8s.PodStatusKilledOOM: counts.KilledOOM++ - default: + case k8s.PodStatusUnknown: counts.Unknown++ } } diff --git a/pkg/operator/resources/job/batchapi/api.go b/pkg/operator/resources/job/batchapi/api.go index 8d87040eef..6ac1c87219 100644 --- a/pkg/operator/resources/job/batchapi/api.go +++ b/pkg/operator/resources/job/batchapi/api.go @@ -140,9 +140,9 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, batchJob apiNameToBatchJobsMap[batchJob.Spec.APIName] = append(apiNameToBatchJobsMap[batchJob.Spec.APIName], &batchJobList[i]) } - for _, virtualService := range virtualServices { - apiName := virtualService.Labels["apiName"] - metadata, err := spec.MetadataFromVirtualService(&virtualService) + for i := range virtualServices { + apiName := virtualServices[i].Labels["apiName"] + metadata, err := spec.MetadataFromVirtualService(&virtualServices[i]) if err != nil { return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } diff --git a/pkg/operator/resources/job/taskapi/api.go b/pkg/operator/resources/job/taskapi/api.go index 6c6afdf425..c5ca6e17fa 100644 --- a/pkg/operator/resources/job/taskapi/api.go +++ b/pkg/operator/resources/job/taskapi/api.go @@ -146,10 +146,10 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs } } - for _, virtualService := range virtualServices { - apiName := virtualService.Labels["apiName"] + for i := range virtualServices { + apiName := virtualServices[i].Labels["apiName"] - metadata, err := spec.MetadataFromVirtualService(&virtualService) + metadata, err := spec.MetadataFromVirtualService(&virtualServices[i]) if err != nil { return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } diff --git 
a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index b7df138a30..885c661366 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -190,7 +190,7 @@ func GetAllAPIs(deployments []kapps.Deployment) ([]schema.APIResponse, error) { return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } mappedRealtimeAPIs[apiName] = schema.APIResponse{ - Status: status.StatusFromDeployment(&deployments[i]), + Status: status.FromDeployment(&deployments[i]), Metadata: metadata, } } @@ -213,7 +213,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return nil, errors.ErrorUnexpected("unable to find deployment", deployedResource.Name) } - apiStatus := status.StatusFromDeployment(deployment) + apiStatus := status.FromDeployment(deployment) apiMetadata, err := spec.MetadataFromDeployment(deployment) if err != nil { return nil, errors.ErrorUnexpected("unable to obtain metadata", deployedResource.Name) @@ -252,7 +252,7 @@ func DescribeAPIByName(deployedResource *operator.DeployedResource) ([]schema.AP return nil, errors.ErrorUnexpected("unable to find deployment", deployedResource.Name) } - apiStatus := status.StatusFromDeployment(deployment) + apiStatus := status.FromDeployment(deployment) apiMetadata, err := spec.MetadataFromDeployment(deployment) if err != nil { return nil, errors.ErrorUnexpected("unable to obtain metadata", deployedResource.Name) diff --git a/pkg/operator/resources/realtimeapi/status.go b/pkg/operator/resources/realtimeapi/status.go index d021cc19f9..160ebea638 100644 --- a/pkg/operator/resources/realtimeapi/status.go +++ b/pkg/operator/resources/realtimeapi/status.go @@ -53,6 +53,10 @@ func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts return } + if !latest { + return + } + switch k8s.GetPodStatus(pod) { case k8s.PodStatusPending: counts.Pending++ @@ -62,6 +66,8 @@ func addPodToReplicaCounts(pod *kcore.Pod, 
deployment *kapps.Deployment, counts counts.Creating++ case k8s.PodStatusReady: counts.Creating++ + case k8s.PodStatusNotReady: + counts.NotReady++ case k8s.PodStatusErrImagePull: counts.ErrImagePull++ case k8s.PodStatusTerminating: @@ -72,7 +78,7 @@ func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts counts.Killed++ case k8s.PodStatusKilledOOM: counts.KilledOOM++ - default: + case k8s.PodStatusUnknown: counts.Unknown++ } } diff --git a/pkg/operator/resources/trafficsplitter/api.go b/pkg/operator/resources/trafficsplitter/api.go index fa02f8ec35..03c89ea4a1 100644 --- a/pkg/operator/resources/trafficsplitter/api.go +++ b/pkg/operator/resources/trafficsplitter/api.go @@ -134,15 +134,15 @@ func getTrafficSplitterDestinations(trafficSplitter *spec.API) []k8s.Destination // GetAllAPIs returns a list of metadata, in the form of schema.APIResponse, about all the created traffic splitter APIs func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService) ([]schema.APIResponse, error) { var trafficSplitters []schema.APIResponse - for _, virtualService := range virtualServices { - apiName := virtualService.Labels["apiName"] + for i := range virtualServices { + apiName := virtualServices[i].Labels["apiName"] - metadata, err := spec.MetadataFromVirtualService(&virtualService) + metadata, err := spec.MetadataFromVirtualService(&virtualServices[i]) if err != nil { return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } - targets, err := userconfig.TrafficSplitterTargetsFromAnnotations(&virtualService) + targets, err := userconfig.TrafficSplitterTargetsFromAnnotations(&virtualServices[i]) if err != nil { return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } diff --git a/pkg/types/status/status.go b/pkg/types/status/status.go index e3faba9e9f..b9b3258119 100644 --- a/pkg/types/status/status.go +++ b/pkg/types/status/status.go @@ -85,7 +85,7 @@ type WorkerCounts struct { Unknown int32 `json:"unknown,omitempty"` } -func 
StatusFromDeployment(deployment *kapps.Deployment) *Status { +func FromDeployment(deployment *kapps.Deployment) *Status { return &Status{ Ready: deployment.Status.ReadyReplicas, Requested: deployment.Status.Replicas, From c5b0e735e9cd8657eeacdee404687be9a5d2b040 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 24 Jul 2021 02:09:46 +0300 Subject: [PATCH 29/40] Fix terminating status --- pkg/operator/resources/asyncapi/status.go | 11 ++++++++--- pkg/operator/resources/realtimeapi/status.go | 11 ++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/pkg/operator/resources/asyncapi/status.go b/pkg/operator/resources/asyncapi/status.go index 3a0e4b5c1a..69977c731e 100644 --- a/pkg/operator/resources/asyncapi/status.go +++ b/pkg/operator/resources/asyncapi/status.go @@ -54,11 +54,18 @@ func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts return } + podStatus := k8s.GetPodStatus(pod) + + if podStatus == k8s.PodStatusTerminating { + counts.Terminating++ + return + } + if !latest { return } - switch k8s.GetPodStatus(pod) { + switch podStatus { case k8s.PodStatusPending: counts.Pending++ case k8s.PodStatusStalled: @@ -71,8 +78,6 @@ func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts counts.NotReady++ case k8s.PodStatusErrImagePull: counts.ErrImagePull++ - case k8s.PodStatusTerminating: - counts.Terminating++ case k8s.PodStatusFailed: counts.Failed++ case k8s.PodStatusKilled: diff --git a/pkg/operator/resources/realtimeapi/status.go b/pkg/operator/resources/realtimeapi/status.go index 160ebea638..0a88b83d17 100644 --- a/pkg/operator/resources/realtimeapi/status.go +++ b/pkg/operator/resources/realtimeapi/status.go @@ -53,11 +53,18 @@ func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts return } + podStatus := k8s.GetPodStatus(pod) + + if podStatus == k8s.PodStatusTerminating { + counts.Terminating++ + return + } + if !latest { return } - switch 
k8s.GetPodStatus(pod) { + switch podStatus { case k8s.PodStatusPending: counts.Pending++ case k8s.PodStatusStalled: @@ -70,8 +77,6 @@ func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts counts.NotReady++ case k8s.PodStatusErrImagePull: counts.ErrImagePull++ - case k8s.PodStatusTerminating: - counts.Terminating++ case k8s.PodStatusFailed: counts.Failed++ case k8s.PodStatusKilled: From 8edee7ab58eee7fc2d731517e3acf5cced3a811c Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Mon, 26 Jul 2021 18:01:01 +0300 Subject: [PATCH 30/40] Fix the worker counts for the batch jobs --- pkg/crds/controllers/batch/batchjob_controller_helpers.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/crds/controllers/batch/batchjob_controller_helpers.go b/pkg/crds/controllers/batch/batchjob_controller_helpers.go index 0785628452..191d5a9b0f 100644 --- a/pkg/crds/controllers/batch/batchjob_controller_helpers.go +++ b/pkg/crds/controllers/batch/batchjob_controller_helpers.go @@ -447,9 +447,10 @@ func (r *BatchJobReconciler) getWorkerJobPods(ctx context.Context, batchJob batc if err := r.List(ctx, &workerJobPods, client.InNamespace(consts.DefaultNamespace), client.MatchingLabels{ - "jobID": batchJob.Name, - "apiName": batchJob.Spec.APIName, - "apiID": batchJob.Spec.APIID, + "jobID": batchJob.Name, + "apiName": batchJob.Spec.APIName, + "apiID": batchJob.Spec.APIID, + "cortex.dev/batch": "worker", }, ); err != nil { return nil, err From a0ec6dfa8588bbf4aeead3885171ff308215e073 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Mon, 26 Jul 2021 22:25:17 +0300 Subject: [PATCH 31/40] Output type (yaml) fixes --- cli/cmd/get.go | 47 +++++++++++++++++---------- cli/cmd/lib_batch_apis.go | 14 +++++--- cli/cmd/lib_task_apis.go | 14 +++++--- pkg/operator/schema/schema.go | 36 ++++++++++----------- pkg/types/spec/api.go | 22 ++++++------- pkg/types/spec/job.go | 40 +++++++++++------------ pkg/types/status/job_status.go | 14 
++++---- pkg/types/status/status.go | 58 +++++++++++++++++----------------- 8 files changed, 136 insertions(+), 109 deletions(-) diff --git a/cli/cmd/get.go b/cli/cmd/get.go index 0fac23d076..c260d0c5e9 100644 --- a/cli/cmd/get.go +++ b/cli/cmd/get.go @@ -35,6 +35,7 @@ import ( libtime "github.com/cortexlabs/cortex/pkg/lib/time" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/userconfig" + "github.com/cortexlabs/yaml" "github.com/spf13/cobra" ) @@ -104,7 +105,7 @@ var _getCmd = &cobra.Command{ return "", err } - if _flagOutput == flags.JSONOutputType { + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return apiTable, nil } @@ -134,7 +135,7 @@ var _getCmd = &cobra.Command{ if err != nil { return "", err } - if _flagOutput == flags.JSONOutputType { + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return jobTable, nil } @@ -164,7 +165,7 @@ var _getCmd = &cobra.Command{ return "", err } - if _flagOutput == flags.JSONOutputType { + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return apiTable, nil } @@ -245,12 +246,16 @@ func getAPIsInAllEnvironments() (string, error) { allAPIsOutput = append(allAPIsOutput, apisOutput) } + var bytes []byte if _flagOutput == flags.JSONOutputType { - bytes, err := libjson.Marshal(allAPIsOutput) - if err != nil { - return "", err - } - + bytes, err = libjson.Marshal(allAPIsOutput) + } else if _flagOutput == flags.YAMLOutputType { + bytes, err = yaml.Marshal(allAPIsOutput) + } + if err != nil { + return "", err + } + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return string(bytes), nil } @@ -335,11 +340,16 @@ func getAPIsByEnv(env cliconfig.Environment) (string, error) { return "", err } + var bytes []byte if _flagOutput == flags.JSONOutputType { - bytes, err := libjson.Marshal(apisRes) - if err != nil { - return "", err - } + bytes, err = libjson.Marshal(apisRes) 
+ } else if _flagOutput == flags.YAMLOutputType { + bytes, err = yaml.Marshal(apisRes) + } + if err != nil { + return "", err + } + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return string(bytes), nil } @@ -455,11 +465,16 @@ func getAPI(env cliconfig.Environment, apiName string) (string, error) { return "", err } + var bytes []byte if _flagOutput == flags.JSONOutputType { - bytes, err := libjson.Marshal(apisRes) - if err != nil { - return "", err - } + bytes, err = libjson.Marshal(apisRes) + } else if _flagOutput == flags.YAMLOutputType { + bytes, err = yaml.Marshal(apisRes) + } + if err != nil { + return "", err + } + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return string(bytes), nil } diff --git a/cli/cmd/lib_batch_apis.go b/cli/cmd/lib_batch_apis.go index 17cdb1301c..ebabc29243 100644 --- a/cli/cmd/lib_batch_apis.go +++ b/cli/cmd/lib_batch_apis.go @@ -31,6 +31,7 @@ import ( libtime "github.com/cortexlabs/cortex/pkg/lib/time" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/status" + "github.com/cortexlabs/yaml" ) const ( @@ -147,11 +148,16 @@ func getBatchJob(env cliconfig.Environment, apiName string, jobID string) (strin return "", err } + var bytes []byte if _flagOutput == flags.JSONOutputType { - bytes, err := libjson.Marshal(resp) - if err != nil { - return "", err - } + bytes, err = libjson.Marshal(resp) + } else if _flagOutput == flags.YAMLOutputType { + bytes, err = yaml.Marshal(resp) + } + if err != nil { + return "", err + } + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return string(bytes), nil } diff --git a/cli/cmd/lib_task_apis.go b/cli/cmd/lib_task_apis.go index 6c3d2b8383..295e1af875 100644 --- a/cli/cmd/lib_task_apis.go +++ b/cli/cmd/lib_task_apis.go @@ -29,6 +29,7 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/table" libtime "github.com/cortexlabs/cortex/pkg/lib/time" 
"github.com/cortexlabs/cortex/pkg/operator/schema" + "github.com/cortexlabs/yaml" ) const ( @@ -142,11 +143,16 @@ func getTaskJob(env cliconfig.Environment, apiName string, jobID string) (string return "", err } + var bytes []byte if _flagOutput == flags.JSONOutputType { - bytes, err := libjson.Marshal(resp) - if err != nil { - return "", err - } + bytes, err = libjson.Marshal(resp) + } else if _flagOutput == flags.YAMLOutputType { + bytes, err = yaml.Marshal(resp) + } + if err != nil { + return "", err + } + if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType { return string(bytes), nil } diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index 522a927a2f..70809e1a07 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -50,20 +50,20 @@ type NodeInfo struct { } type DeployResult struct { - API *APIResponse `json:"api"` - Message string `json:"message"` - Error string `json:"error"` + API *APIResponse `json:"api" yaml:"api"` + Message string `json:"message" yaml:"message"` + Error string `json:"error" yaml:"error"` } type APIResponse struct { - Spec *spec.API `json:"spec,omitempty"` - Metadata *spec.Metadata `json:"metadata,omitempty"` - Status *status.Status `json:"status,omitempty"` - Endpoint *string `json:"endpoint,omitempty"` - DashboardURL *string `json:"dashboard_url,omitempty"` - BatchJobStatuses []status.BatchJobStatus `json:"batch_job_statuses,omitempty"` - TaskJobStatuses []status.TaskJobStatus `json:"task_job_statuses,omitempty"` - APIVersions []APIVersion `json:"api_versions,omitempty"` + Spec *spec.API `json:"spec,omitempty" yaml:"spec,omitempty"` + Metadata *spec.Metadata `json:"metadata,omitempty" yaml:"metadata,omitempty"` + Status *status.Status `json:"status,omitempty" yaml:"status,omitempty"` + Endpoint *string `json:"endpoint,omitempty" yaml:"endpoint,omitempty"` + DashboardURL *string `json:"dashboard_url,omitempty" yaml:"dashboard_url,omitempty"` + BatchJobStatuses 
[]status.BatchJobStatus `json:"batch_job_statuses,omitempty" yaml:"batch_job_statuses,omitempty"` + TaskJobStatuses []status.TaskJobStatus `json:"task_job_statuses,omitempty" yaml:"task_job_statuses,omitempty"` + APIVersions []APIVersion `json:"api_versions,omitempty" yaml:"api_versions,omitempty"` } type LogResponse struct { @@ -71,16 +71,16 @@ type LogResponse struct { } type BatchJobResponse struct { - APISpec spec.API `json:"api_spec"` - JobStatus status.BatchJobStatus `json:"job_status"` - Metrics *metrics.BatchMetrics `json:"metrics,omitempty"` - Endpoint string `json:"endpoint"` + APISpec spec.API `json:"api_spec" yaml:"api_spec"` + JobStatus status.BatchJobStatus `json:"job_status" yaml:"job_status"` + Metrics *metrics.BatchMetrics `json:"metrics,omitempty" yaml:"metrics,omitempty"` + Endpoint string `json:"endpoint" yaml:"endpoint"` } type TaskJobResponse struct { - APISpec spec.API `json:"api_spec"` - JobStatus status.TaskJobStatus `json:"job_status"` - Endpoint string `json:"endpoint"` + APISpec spec.API `json:"api_spec" yaml:"api_spec"` + JobStatus status.TaskJobStatus `json:"job_status" yaml:"job_status"` + Endpoint string `json:"endpoint" yaml:"endpoint"` } type DeleteResponse struct { diff --git a/pkg/types/spec/api.go b/pkg/types/spec/api.go index b229962aee..5b0d39210c 100644 --- a/pkg/types/spec/api.go +++ b/pkg/types/spec/api.go @@ -36,23 +36,23 @@ import ( type API struct { *userconfig.API - ID string `json:"id"` - SpecID string `json:"spec_id"` - PodID string `json:"pod_id"` - DeploymentID string `json:"deployment_id"` + ID string `json:"id" yaml:"id"` + SpecID string `json:"spec_id" yaml:"spec_id"` + PodID string `json:"pod_id" yaml:"pod_id"` + DeploymentID string `json:"deployment_id" yaml:"deployment_id"` - Key string `json:"key"` + Key string `json:"key" yaml:"key"` - InitialDeploymentTime int64 `json:"initial_deployment_time"` - LastUpdated int64 `json:"last_updated"` - MetadataRoot string `json:"metadata_root"` + InitialDeploymentTime 
int64 `json:"initial_deployment_time" yaml:"initial_deployment_time"` + LastUpdated int64 `json:"last_updated" yaml:"last_updated"` + MetadataRoot string `json:"metadata_root" yaml:"metadata_root"` } type Metadata struct { *userconfig.Resource - APIID string `json:"id"` - DeploymentID string `json:"deployment_id,omitempty"` - LastUpdated int64 `json:"last_updated"` + APIID string `json:"id" yaml:"id"` + DeploymentID string `json:"deployment_id,omitempty" yaml:"deployment_id,omitempty"` + LastUpdated int64 `json:"last_updated" yaml:"last_updated"` } func MetadataFromDeployment(deployment *kapps.Deployment) (*Metadata, error) { diff --git a/pkg/types/spec/job.go b/pkg/types/spec/job.go index 784fb4f199..d6c6cb354d 100644 --- a/pkg/types/spec/job.go +++ b/pkg/types/spec/job.go @@ -32,9 +32,9 @@ const ( ) type JobKey struct { - ID string `json:"job_id"` - APIName string `json:"api_name"` - Kind userconfig.Kind `json:"kind"` + ID string `json:"job_id" yaml:"job_id"` + APIName string `json:"api_name" yaml:"api_name"` + Kind userconfig.Kind `json:"kind" yaml:"kind"` } func (j JobKey) UserString() string { @@ -56,39 +56,39 @@ func (j JobKey) K8sName() string { } type SQSDeadLetterQueue struct { - ARN string `json:"arn"` - MaxReceiveCount int `json:"max_receive_count"` + ARN string `json:"arn" yaml:"arn"` + MaxReceiveCount int `json:"max_receive_count" yaml:"max_receive_count"` } type RuntimeBatchJobConfig struct { - Workers int `json:"workers"` - SQSDeadLetterQueue *SQSDeadLetterQueue `json:"sqs_dead_letter_queue"` - Config map[string]interface{} `json:"config"` - Timeout *int `json:"timeout"` + Workers int `json:"workers" yaml:"workers"` + SQSDeadLetterQueue *SQSDeadLetterQueue `json:"sqs_dead_letter_queue" yaml:"sqs_dead_letter_queue"` + Config map[string]interface{} `json:"config" yaml:"config"` + Timeout *int `json:"timeout" yaml:"timeout"` } type RuntimeTaskJobConfig struct { - Workers int `json:"workers"` - Config map[string]interface{} `json:"config"` - Timeout *int 
`json:"timeout"` + Workers int `json:"workers" yaml:"workers"` + Config map[string]interface{} `json:"config" yaml:"config"` + Timeout *int `json:"timeout" yaml:"timeout"` } type BatchJob struct { JobKey RuntimeBatchJobConfig - APIID string `json:"api_id"` - SQSUrl string `json:"sqs_url"` - TotalBatchCount int `json:"total_batch_count,omitempty"` - StartTime time.Time `json:"start_time,omitempty"` + APIID string `json:"api_id" yaml:"api_id"` + SQSUrl string `json:"sqs_url" yaml:"sqs_url"` + TotalBatchCount int `json:"total_batch_count,omitempty" yaml:"total_batch_count,omitempty"` + StartTime time.Time `json:"start_time,omitempty" yaml:"start_time,omitempty"` } type TaskJob struct { JobKey RuntimeTaskJobConfig - APIID string `json:"api_id"` - SpecID string `json:"spec_id"` - PodID string `json:"pod_id"` - StartTime time.Time `json:"start_time"` + APIID string `json:"api_id" yaml:"api_id"` + SpecID string `json:"spec_id" yaml:"spec_id"` + PodID string `json:"pod_id" yaml:"pod_id"` + StartTime time.Time `json:"start_time" yaml:"start_time"` } // e.g. 
//jobs/// diff --git a/pkg/types/status/job_status.go b/pkg/types/status/job_status.go index eb299831ba..f106d051a7 100644 --- a/pkg/types/status/job_status.go +++ b/pkg/types/status/job_status.go @@ -24,15 +24,15 @@ import ( type BatchJobStatus struct { spec.BatchJob - Status JobCode `json:"status"` - EndTime *time.Time `json:"end_time,omitempty"` - BatchesInQueue int `json:"batches_in_queue"` - WorkerCounts *WorkerCounts `json:"worker_counts,omitempty"` + Status JobCode `json:"status" yaml:"status"` + EndTime *time.Time `json:"end_time,omitempty" yaml:"end_time,omitempty"` + BatchesInQueue int `json:"batches_in_queue" yaml:"batches_in_queue"` + WorkerCounts *WorkerCounts `json:"worker_counts,omitempty" yaml:"worker_counts,omitempty"` } type TaskJobStatus struct { spec.TaskJob - EndTime *time.Time `json:"end_time"` - Status JobCode `json:"status"` - WorkerCounts *WorkerCounts `json:"worker_counts"` + EndTime *time.Time `json:"end_time,omitempty" yaml:"end_time,omitempty"` + Status JobCode `json:"status" yaml:"status"` + WorkerCounts *WorkerCounts `json:"worker_counts,omitempty" yaml:"worker_counts,omitempty"` } diff --git a/pkg/types/status/status.go b/pkg/types/status/status.go index b9b3258119..5872922b9a 100644 --- a/pkg/types/status/status.go +++ b/pkg/types/status/status.go @@ -21,10 +21,10 @@ import ( ) type Status struct { - Ready int32 `json:"ready"` - Requested int32 `json:"requested"` - UpToDate int32 `json:"up_to_date"` - ReplicaCounts *ReplicaCounts `json:"replica_counts,omitempty"` + Ready int32 `json:"ready" yaml:"ready"` + Requested int32 `json:"requested" yaml:"requested"` + UpToDate int32 `json:"up_to_date" yaml:"up_to_date"` + ReplicaCounts *ReplicaCounts `json:"replica_counts,omitempty" yaml:"replica_counts,omitempty"` } type ReplicaCountType string @@ -54,35 +54,35 @@ var ReplicaCountTypes []ReplicaCountType = []ReplicaCountType{ } type ReplicaCounts struct { - Requested int32 `json:"requested"` - Pending int32 `json:"pending"` - Creating int32 
`json:"creating"` - NotReady int32 `json:"not_ready"` - Ready int32 `json:"ready"` - ReadyOutOfDate int32 `json:"ready_out_of_date"` - ErrImagePull int32 `json:"err_image_pull"` - Terminating int32 `json:"terminating"` - Failed int32 `json:"failed"` - Killed int32 `json:"killed"` - KilledOOM int32 `json:"killed_oom"` - Stalled int32 `json:"stalled"` // pending for a long time - Unknown int32 `json:"unknown"` + Requested int32 `json:"requested" yaml:"requested"` + Pending int32 `json:"pending" yaml:"pending"` + Creating int32 `json:"creating" yaml:"creating"` + NotReady int32 `json:"not_ready" yaml:"not_ready"` + Ready int32 `json:"ready" yaml:"ready"` + ReadyOutOfDate int32 `json:"ready_out_of_date" yaml:"ready_out_of_date"` + ErrImagePull int32 `json:"err_image_pull" yaml:"err_image_pull"` + Terminating int32 `json:"terminating" yaml:"terminating"` + Failed int32 `json:"failed" yaml:"failed"` + Killed int32 `json:"killed" yaml:"killed"` + KilledOOM int32 `json:"killed_oom" yaml:"killed_oom"` + Stalled int32 `json:"stalled" yaml:"stalled"` // pending for a long time + Unknown int32 `json:"unknown" yaml:"unknown"` } // Worker counts don't have as many failure variations because Jobs clean up dead pods, so counting different failure scenarios isn't interesting type WorkerCounts struct { - Pending int32 `json:"pending,omitempty"` - Creating int32 `json:"creating,omitempty"` - NotReady int32 `json:"not_ready,omitempty"` - Ready int32 `json:"ready,omitempty"` - Succeeded int32 `json:"succeeded,omitempty"` - ErrImagePull int32 `json:"err_image_pull,omitempty"` - Terminating int32 `json:"terminating,omitempty"` - Failed int32 `json:"failed,omitempty"` - Killed int32 `json:"killed,omitempty"` - KilledOOM int32 `json:"killed_oom,omitempty"` - Stalled int32 `json:"stalled,omitempty"` // pending for a long time - Unknown int32 `json:"unknown,omitempty"` + Pending int32 `json:"pending,omitempty" yaml:"pending,omitempty"` + Creating int32 `json:"creating,omitempty" 
yaml:"creating,omitempty"` + NotReady int32 `json:"not_ready,omitempty" yaml:"not_ready,omitempty"` + Ready int32 `json:"ready,omitempty" yaml:"ready,omitempty"` + Succeeded int32 `json:"succeeded,omitempty" yaml:"succeeded,omitempty"` + ErrImagePull int32 `json:"err_image_pull,omitempty" yaml:"err_image_pull,omitempty"` + Terminating int32 `json:"terminating,omitempty" yaml:"terminating,omitempty"` + Failed int32 `json:"failed,omitempty" yaml:"failed,omitempty"` + Killed int32 `json:"killed,omitempty" yaml:"killed,omitempty"` + KilledOOM int32 `json:"killed_oom,omitempty" yaml:"killed_oom,omitempty"` + Stalled int32 `json:"stalled,omitempty" yaml:"stalled,omitempty"` // pending for a long time + Unknown int32 `json:"unknown,omitempty" yaml:"unknown,omitempty"` } func FromDeployment(deployment *kapps.Deployment) *Status { From e2b6404cc8aaca8b40df5fe06a1c880203765fb0 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Mon, 26 Jul 2021 22:27:23 +0300 Subject: [PATCH 32/40] Fix --- pkg/operator/schema/schema.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index 70809e1a07..1127d3dbf8 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -97,8 +97,8 @@ type ErrorResponse struct { } type APIVersion struct { - APIID string `json:"api_id"` - LastUpdated int64 `json:"last_updated"` + APIID string `json:"api_id" yaml:"api_id"` + LastUpdated int64 `json:"last_updated" yaml:"last_updated"` } type VerifyCortexResponse struct{} From f2bc8bb2de4135adc9eb5e190d05e4d31059c8a0 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 27 Jul 2021 01:10:53 +0300 Subject: [PATCH 33/40] Update docs --- dev/generate_cli_md.sh | 1 + docs/clients/cli.md | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/dev/generate_cli_md.sh b/dev/generate_cli_md.sh index 5715f6fdb8..fdf2566624 100755 --- a/dev/generate_cli_md.sh +++ b/dev/generate_cli_md.sh @@ 
-33,6 +33,7 @@ echo "# CLI commands" >> $out_file commands=( "deploy" "get" + "describe" "logs" "refresh" "delete" diff --git a/docs/clients/cli.md b/docs/clients/cli.md index be43886dba..b10957bfe4 100644 --- a/docs/clients/cli.md +++ b/docs/clients/cli.md @@ -32,6 +32,20 @@ Flags: -h, --help help for get ``` +## describe + +```text +describe an api + +Usage: + cortex describe [API_NAME] [flags] + +Flags: + -e, --env string environment to use + -w, --watch re-run the command every 2 seconds + -h, --help help for describe +``` + ## logs ```text From af364a65b2ccc46b409512e6ffd90730e56aa936 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 27 Jul 2021 19:20:07 +0300 Subject: [PATCH 34/40] Address PR comments --- pkg/activator/activator.go | 2 +- pkg/activator/helpers.go | 22 +------ .../batch/batchjob_controller_helpers.go | 61 ++++++++++--------- pkg/lib/k8s/pod.go | 26 +++----- pkg/types/status/status.go | 32 +++++----- pkg/types/userconfig/api.go | 14 +++++ 6 files changed, 74 insertions(+), 83 deletions(-) diff --git a/pkg/activator/activator.go b/pkg/activator/activator.go index b7c54adc3d..7b68736951 100644 --- a/pkg/activator/activator.go +++ b/pkg/activator/activator.go @@ -131,7 +131,7 @@ func (a *activator) getOrCreateAPIActivator(ctx context.Context, apiName string) return nil, errors.WithStack(err) } - maxQueueLength, maxConcurrency, err := concurrencyFromAnnotations(vs.Annotations) + maxQueueLength, maxConcurrency, err := userconfig.ConcurrencyFromAnnotations(vs) if err != nil { return nil, err } diff --git a/pkg/activator/helpers.go b/pkg/activator/helpers.go index 48790b5ac0..5bce2cb7bf 100644 --- a/pkg/activator/helpers.go +++ b/pkg/activator/helpers.go @@ -17,8 +17,6 @@ limitations under the License. 
package activator import ( - "strconv" - "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/types/userconfig" "k8s.io/apimachinery/pkg/api/meta" @@ -50,8 +48,7 @@ func getAPIMeta(obj interface{}) (apiMeta, error) { return apiMeta{}, errors.ErrorUnexpected("got a virtual service without apiName label") } - annotations := resource.GetAnnotations() - maxQueueLength, maxConcurrency, err := concurrencyFromAnnotations(annotations) + maxQueueLength, maxConcurrency, err := userconfig.ConcurrencyFromAnnotations(resource) if err != nil { return apiMeta{}, err } @@ -60,23 +57,8 @@ func getAPIMeta(obj interface{}) (apiMeta, error) { apiName: apiName, apiKind: userconfig.KindFromString(apiKind), labels: labels, - annotations: annotations, + annotations: resource.GetAnnotations(), maxConcurrency: maxConcurrency, maxQueueLength: maxQueueLength, }, nil } - -// TODO move this out of here -func concurrencyFromAnnotations(annotations map[string]string) (int, int, error) { - maxQueueLength, err := strconv.Atoi(annotations[userconfig.MaxQueueLengthAnnotationKey]) - if err != nil { - return 0, 0, errors.ErrorUnexpected("failed to parse annotation", userconfig.MaxQueueLengthAnnotationKey) - } - - maxConcurrency, err := strconv.Atoi(annotations[userconfig.MaxConcurrencyAnnotationKey]) - if err != nil { - return 0, 0, errors.ErrorUnexpected("failed to parse annotation", userconfig.MaxConcurrencyAnnotationKey) - } - - return maxQueueLength, maxConcurrency, err -} diff --git a/pkg/crds/controllers/batch/batchjob_controller_helpers.go b/pkg/crds/controllers/batch/batchjob_controller_helpers.go index 191d5a9b0f..0f11ba67ba 100644 --- a/pkg/crds/controllers/batch/batchjob_controller_helpers.go +++ b/pkg/crds/controllers/batch/batchjob_controller_helpers.go @@ -531,35 +531,7 @@ func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.B batchJob.Status.Status = status.JobRunning } - // TODO move this to its own function - workerCounts := 
status.WorkerCounts{} - for i := range workerJobPods { - switch k8s.GetPodStatus(&workerJobPods[i]) { - case k8s.PodStatusPending: - workerCounts.Pending++ - case k8s.PodStatusStalled: - workerCounts.Stalled++ - case k8s.PodStatusCreating: - workerCounts.Creating++ - case k8s.PodStatusNotReady: - workerCounts.NotReady++ - case k8s.PodStatusErrImagePull: - workerCounts.ErrImagePull++ - case k8s.PodStatusTerminating: - workerCounts.Terminating++ - case k8s.PodStatusFailed: - workerCounts.Failed++ - case k8s.PodStatusKilled: - workerCounts.Killed++ - case k8s.PodStatusKilledOOM: - workerCounts.KilledOOM++ - case k8s.PodStatusSucceeded: - workerCounts.Succeeded++ - case k8s.PodStatusUnknown: - workerCounts.Unknown++ - } - } - + workerCounts := getReplicaCounts(workerJobPods) batchJob.Status.WorkerCounts = &workerCounts } @@ -759,3 +731,34 @@ func saveJobStatus(r *BatchJobReconciler, batchJob batch.BatchJob) error { }, ) } + +func getReplicaCounts(workerJobPods []kcore.Pod) status.WorkerCounts { + workerCounts := status.WorkerCounts{} + for i := range workerJobPods { + switch k8s.GetPodStatus(&workerJobPods[i]) { + case k8s.PodStatusPending: + workerCounts.Pending++ + case k8s.PodStatusStalled: + workerCounts.Stalled++ + case k8s.PodStatusCreating: + workerCounts.Creating++ + case k8s.PodStatusNotReady: + workerCounts.NotReady++ + case k8s.PodStatusErrImagePull: + workerCounts.ErrImagePull++ + case k8s.PodStatusTerminating: + workerCounts.Terminating++ + case k8s.PodStatusFailed: + workerCounts.Failed++ + case k8s.PodStatusKilled: + workerCounts.Killed++ + case k8s.PodStatusKilledOOM: + workerCounts.KilledOOM++ + case k8s.PodStatusSucceeded: + workerCounts.Succeeded++ + case k8s.PodStatusUnknown: + workerCounts.Unknown++ + } + } + return workerCounts +} diff --git a/pkg/lib/k8s/pod.go b/pkg/lib/k8s/pod.go index 0d3e0125bd..293e88a476 100644 --- a/pkg/lib/k8s/pod.go +++ b/pkg/lib/k8s/pod.go @@ -19,7 +19,6 @@ package k8s import ( "bytes" "context" - "fmt" "regexp" "time" 
@@ -67,10 +66,8 @@ const ( PodStatusKilled PodStatus = "Killed" PodStatusKilledOOM PodStatus = "KilledOOM" PodStatusStalled PodStatus = "Stalled" - - PodStatusSucceeded PodStatus = "Succeeded" - - PodStatusUnknown PodStatus = "Unknown" + PodStatusSucceeded PodStatus = "Succeeded" + PodStatusUnknown PodStatus = "Unknown" ) var ( @@ -161,11 +158,9 @@ func IsPodReady(pod *kcore.Pod) bool { return false } - // TODO use the GetPodConditionOf func here - for _, condition := range pod.Status.Conditions { - if condition.Type == kcore.PodReady && condition.Status == kcore.ConditionTrue { - return true - } + podConditionState, _ := GetPodConditionOf(pod, kcore.PodReady) + if podConditionState != nil && *podConditionState { + return true } return false @@ -176,12 +171,9 @@ func IsPodStalled(pod *kcore.Pod) bool { return false } - // TODO use the GetPodConditionOf func here - for _, condition := range pod.Status.Conditions { - if condition.Type == kcore.PodScheduled && condition.Status == kcore.ConditionFalse && !condition.LastTransitionTime.Time.IsZero() && time.Since(condition.LastTransitionTime.Time) >= _waitForCreatingPodTimeout { - fmt.Println(time.Since(condition.LastTransitionTime.Time), _waitForCreatingPodTimeout) - return true - } + podConditionState, podCondition := GetPodConditionOf(pod, kcore.PodScheduled) + if podConditionState != nil && !*podConditionState && !podCondition.LastTransitionTime.Time.IsZero() && time.Since(podCondition.LastTransitionTime.Time) >= _waitForCreatingPodTimeout { + return true } return false @@ -269,7 +261,7 @@ func GetPodStatus(pod *kcore.Pod) PodStatus { } status := PodStatusFromContainerStatuses(pod.Status.ContainerStatuses) - if status == PodStatusReady || status == PodStatusNotReady { + if status == PodStatusReady { return PodStatusNotReady } diff --git a/pkg/types/status/status.go b/pkg/types/status/status.go index 5872922b9a..15288fc8d1 100644 --- a/pkg/types/status/status.go +++ b/pkg/types/status/status.go @@ -21,28 +21,28 @@ 
import ( ) type Status struct { - Ready int32 `json:"ready" yaml:"ready"` - Requested int32 `json:"requested" yaml:"requested"` - UpToDate int32 `json:"up_to_date" yaml:"up_to_date"` + Ready int32 `json:"ready" yaml:"ready"` // deployment-reported number of ready replicas (latest + out of date) + Requested int32 `json:"requested" yaml:"requested"` // deployment-reported number of requested replicas + UpToDate int32 `json:"up_to_date" yaml:"up_to_date"` // deployment-reported number of up-to-date replicas (in whichever phase they are found in) ReplicaCounts *ReplicaCounts `json:"replica_counts,omitempty" yaml:"replica_counts,omitempty"` } type ReplicaCountType string const ( - ReplicaCountRequested ReplicaCountType = "Requested" - ReplicaCountPending ReplicaCountType = "Pending" - ReplicaCountCreating ReplicaCountType = "Creating" - ReplicaCountNotReady ReplicaCountType = "NotReady" - ReplicaCountReady ReplicaCountType = "Ready" - ReplicaCountReadyOutOfDate ReplicaCountType = "ReadyOutOfDate" - ReplicaCountErrImagePull ReplicaCountType = "ErrImagePull" - ReplicaCountTerminating ReplicaCountType = "Terminating" - ReplicaCountFailed ReplicaCountType = "Failed" - ReplicaCountKilled ReplicaCountType = "Killed" - ReplicaCountKilledOOM ReplicaCountType = "KilledOOM" - ReplicaCountStalled ReplicaCountType = "Stalled" - ReplicaCountUnknown ReplicaCountType = "Unknown" + ReplicaCountRequested ReplicaCountType = "Requested" // requested number of replicas (for up-to-date pods) + ReplicaCountPending ReplicaCountType = "Pending" // pods that are in the pending state (for up-to-date pods) + ReplicaCountCreating ReplicaCountType = "Creating" // pods that that have their init/non-init containers in the process of being created (for up-to-date pods) + ReplicaCountNotReady ReplicaCountType = "NotReady" // pods that are not passing the readiness checks (for up-to-date pods) + ReplicaCountReady ReplicaCountType = "Ready" // pods that are passing the readiness checks (for up-to-date 
pods) + ReplicaCountReadyOutOfDate ReplicaCountType = "ReadyOutOfDate" // pods that are passing the readiness checks (for out-of-date pods) + ReplicaCountErrImagePull ReplicaCountType = "ErrImagePull" // pods that couldn't pull the containers' images (for up-to-date pods) + ReplicaCountTerminating ReplicaCountType = "Terminating" // pods that are in a terminating state (for up-to-date pods) + ReplicaCountFailed ReplicaCountType = "Failed" // pods that have had their containers erroring (for up-to-date pods) + ReplicaCountKilled ReplicaCountType = "Killed" // pods that have had their container processes killed (for up-to-date pods) + ReplicaCountKilledOOM ReplicaCountType = "KilledOOM" // pods that have had their containers OOM (for up-to-date pods) + ReplicaCountStalled ReplicaCountType = "Stalled" // pods that have been in a pending state for more than 15 mins (for up-to-date pods) + ReplicaCountUnknown ReplicaCountType = "Unknown" // pods that are in an unknown state (for up-to-date pods) ) var ReplicaCountTypes []ReplicaCountType = []ReplicaCountType{ diff --git a/pkg/types/userconfig/api.go b/pkg/types/userconfig/api.go index d1e9f65f7b..a90a29e952 100644 --- a/pkg/types/userconfig/api.go +++ b/pkg/types/userconfig/api.go @@ -265,6 +265,20 @@ func EndpointFromAnnotation(k8sObj kmeta.Object) (string, error) { return endpoint, nil } +func ConcurrencyFromAnnotations(k8sObj kmeta.Object) (int, int, error) { + maxQueueLength, err := k8s.ParseIntAnnotation(k8sObj, MaxQueueLengthAnnotationKey) + if err != nil { + return 0, 0, err + } + + maxConcurrency, err := k8s.ParseIntAnnotation(k8sObj, MaxConcurrencyAnnotationKey) + if err != nil { + return 0, 0, err + } + + return maxQueueLength, maxConcurrency, nil +} + func (api *API) UserStr() string { var sb strings.Builder sb.WriteString(fmt.Sprintf("%s: %s\n", NameKey, api.Name)) From f891aa368a6c1a9d4a7987992dec55446aafe6e1 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 27 Jul 2021 20:17:06 +0300 Subject: 
[PATCH 35/40] Docs updates --- docs/workloads/async/statuses.md | 22 +++++++++++++++++- docs/workloads/batch/statuses.md | 2 +- docs/workloads/realtime/statuses.md | 27 ++++++++++++++-------- docs/workloads/realtime/troubleshooting.md | 6 ++--- docs/workloads/task/statuses.md | 2 +- 5 files changed, 44 insertions(+), 15 deletions(-) diff --git a/docs/workloads/async/statuses.md b/docs/workloads/async/statuses.md index 3ecaeba865..5cd8bd7cb4 100644 --- a/docs/workloads/async/statuses.md +++ b/docs/workloads/async/statuses.md @@ -1,4 +1,4 @@ -# Statuses +# Request statuses | Status | Meaning | | :--- | :--- | @@ -6,3 +6,23 @@ | in_progress | Workload has been pulled by the API and is currently being processed | | completed | Workload has completed with success | | failed | Workload encountered an error during processing | + +# Replica states + +The replica states of an API can be inspected by running the `cortex describe ` command. When run, a table is presented that shows how many replicas of the said API are found in each of the following states: + +| State | Meaning | +|:---|:---| +| Ready | Replica is running and it has passed the readiness checks | +| ReadyOutOfDate | Replica is running and it has passed the readiness checks (for an out-of-date replica) | +| NotReady | Replica is running but it's not passing the readiness checks; make sure the server is listening on the designed port of the API | +| Requested | Requested number of replicas for a given API | +| Pending | Replica is in a pending state (waiting to get scheduled onto a node) | +| Creating | Replica is in the process of having its containers created | +| ErrImagePull | Replica was not created because one of the specified Docker images was inaccessible at runtime; check that your API's docker images exist and are accessible via your cluster's AWS credentials | +| Failed | Replica couldn't start due to an error; run `cortex logs ` to view the logs | +| Killed | Replica has had one of its containers' 
process(es) killed | +| KilledOOM | Replica was terminated due to excessive memory usage; try allocating more memory to the API and re-deploy | +| Stalled | Replica has been in a pending state for more than 15 minutes; causes like insufficient memory, CPU, GPU or Inf could be culprit; could also be that the node selector on the API is out-of-date | +| Terminating | Replica is currently in the process of being terminated | +| Unknown | Replica is in an undefined state; should not be possible | diff --git a/docs/workloads/batch/statuses.md b/docs/workloads/batch/statuses.md index 1bcddcd6bd..019ca55789 100644 --- a/docs/workloads/batch/statuses.md +++ b/docs/workloads/batch/statuses.md @@ -1,4 +1,4 @@ -# Statuses +# Job statuses | Status | Meaning | | :--- | :--- | diff --git a/docs/workloads/realtime/statuses.md b/docs/workloads/realtime/statuses.md index 2ee32aca40..6decef16f0 100644 --- a/docs/workloads/realtime/statuses.md +++ b/docs/workloads/realtime/statuses.md @@ -1,10 +1,19 @@ -# Statuses +# Replica states -| Status | Meaning | -| :--- | :--- | -| live | API is deployed and ready to serve requests (at least one replica is running) | -| updating | API is updating | -| error | API was not created due to an error; run `cortex logs ` to view the logs | -| error (image pull) | API was not created because one of the specified Docker images was inaccessible at runtime; check that your API's docker images exist and are accessible via your cluster's AWS credentials | -| error (out of memory) | API was terminated due to excessive memory usage; try allocating more memory to the API and re-deploying | -| compute unavailable | API could not start due to insufficient memory, CPU, GPU, or Inf in the cluster; some replicas may be ready | +The replica states of an API can be inspected by running the `cortex describe ` command. 
When run, a table is presented that shows how many replicas of the said API are found in each of the following states: + +| State | Meaning | +|:---|:---| +| Ready | Replica is running and it has passed the readiness checks | +| ReadyOutOfDate | Replica is running and it has passed the readiness checks (for an out-of-date replica) | +| NotReady | Replica is running but it's not passing the readiness checks; make sure the server is listening on the designed port of the API | +| Requested | Requested number of replicas for a given API | +| Pending | Replica is in a pending state (waiting to get scheduled onto a node) | +| Creating | Replica is in the process of having its containers created | +| ErrImagePull | Replica was not created because one of the specified Docker images was inaccessible at runtime; check that your API's docker images exist and are accessible via your cluster's AWS credentials | +| Failed | Replica couldn't start due to an error; run `cortex logs ` to view the logs | +| Killed | Replica has had one of its containers' process(es) killed | +| KilledOOM | Replica was terminated due to excessive memory usage; try allocating more memory to the API and re-deploy | +| Stalled | Replica has been in a pending state for more than 15 minutes; causes like insufficient memory, CPU, GPU or Inf could be culprit; could also be that the node selector on the API is out-of-date | +| Terminating | Replica is currently in the process of being terminated | +| Unknown | Replica is in an undefined state; should not be possible | diff --git a/docs/workloads/realtime/troubleshooting.md b/docs/workloads/realtime/troubleshooting.md index 61de9dfe74..5254d25aaa 100644 --- a/docs/workloads/realtime/troubleshooting.md +++ b/docs/workloads/realtime/troubleshooting.md @@ -4,14 +4,14 @@ When making requests to your API, it's possible to get a `no healthy upstream` error message (with HTTP status code `503`). 
This means that there are currently no live replicas running for your API. This could happen for a few reasons: -1. It's possible that your API is simply not ready yet. You can check the status of your API with `cortex get API_NAME`, and inspect the logs in CloudWatch with the help of `cortex logs API_NAME`. -1. Your API may have errored during initialization or while responding to a previous request. `cortex get API_NAME` will show the status of your API, and you can view the logs for all replicas by visiting the CloudWatch Insights URL from `cortex logs API_NAME`. +1. It's possible that your API is simply not ready yet. You can check the number of ready replicas on your API with `cortex get API_NAME`, and inspect the logs in CloudWatch with the help of `cortex logs API_NAME`. +1. Your API may have errored during initialization or while responding to a previous request. `cortex describe API_NAME` will show the number of replicas that have failed to start on your API, and you can view the logs for all replicas by visiting the CloudWatch Insights URL from `cortex logs API_NAME`. If you are using API Gateway in front of your API endpoints, it is also possible to receive a `{"message":"Service Unavailable"}` error message (with HTTP status code `503`) after 29 seconds if your request exceeds API Gateway's 29 second timeout. If this is the case, you can either modify your code to take less time, run on faster hardware (e.g. GPUs), or don't use API Gateway (there is no timeout when using the API's endpoint directly). ## API is stuck updating -If your API is stuck in the "updating" or "compute unavailable" state (which is displayed when running `cortex get`), there are a few possible causes. Here are some things to check: +If your API has pods stuck in the "pending" or "stalled" states (which is displayed when running `cortex describe API_NAME`), there are a few possible causes. 
Here are some things to check: ### Inspect API logs in CloudWatch diff --git a/docs/workloads/task/statuses.md b/docs/workloads/task/statuses.md index b51eaf010f..0631ab68f2 100644 --- a/docs/workloads/task/statuses.md +++ b/docs/workloads/task/statuses.md @@ -1,4 +1,4 @@ -# Statuses +# Job statuses | Status | Meaning | | :--- | :--- | From 121a66942714a42e4be507fe012cda9ec8b73679 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Tue, 27 Jul 2021 10:26:35 -0700 Subject: [PATCH 36/40] Update statuses.md --- docs/workloads/realtime/statuses.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/workloads/realtime/statuses.md b/docs/workloads/realtime/statuses.md index 6decef16f0..a6a12cf225 100644 --- a/docs/workloads/realtime/statuses.md +++ b/docs/workloads/realtime/statuses.md @@ -16,4 +16,4 @@ The replica states of an API can be inspected by running the `cortex describe Date: Tue, 27 Jul 2021 10:27:05 -0700 Subject: [PATCH 37/40] Update statuses.md --- docs/workloads/async/statuses.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/workloads/async/statuses.md b/docs/workloads/async/statuses.md index 5cd8bd7cb4..807f97e568 100644 --- a/docs/workloads/async/statuses.md +++ b/docs/workloads/async/statuses.md @@ -25,4 +25,4 @@ The replica states of an API can be inspected by running the `cortex describe Date: Tue, 27 Jul 2021 11:07:27 -0700 Subject: [PATCH 38/40] Update statuses.md --- docs/workloads/async/statuses.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/workloads/async/statuses.md b/docs/workloads/async/statuses.md index 807f97e568..9c4787f293 100644 --- a/docs/workloads/async/statuses.md +++ b/docs/workloads/async/statuses.md @@ -9,20 +9,19 @@ # Replica states -The replica states of an API can be inspected by running the `cortex describe ` command. 
When run, a table is presented that shows how many replicas of the said API are found in each of the following states: +The replica states of an API can be inspected by running `cortex describe `. Here are the possible states for each replica in an API: | State | Meaning | |:---|:---| | Ready | Replica is running and it has passed the readiness checks | | ReadyOutOfDate | Replica is running and it has passed the readiness checks (for an out-of-date replica) | | NotReady | Replica is running but it's not passing the readiness checks; make sure the server is listening on the designed port of the API | -| Requested | Requested number of replicas for a given API | | Pending | Replica is in a pending state (waiting to get scheduled onto a node) | | Creating | Replica is in the process of having its containers created | | ErrImagePull | Replica was not created because one of the specified Docker images was inaccessible at runtime; check that your API's docker images exist and are accessible via your cluster's AWS credentials | | Failed | Replica couldn't start due to an error; run `cortex logs ` to view the logs | -| Killed | Replica has had one of its containers' process(es) killed | +| Killed | Replica has had one of its containers killed | | KilledOOM | Replica was terminated due to excessive memory usage; try allocating more memory to the API and re-deploy | -| Stalled | Replica has been in a pending state for more than 15 minutes; causes like insufficient memory, CPU, GPU or Inf could be culprit; could also be that the node selector on the API is out-of-date | +| Stalled | Replica has been in a pending state for more than 15 minutes; see [troubleshooting](../realtime/troubleshooting.md) | | Terminating | Replica is currently in the process of being terminated | | Unknown | Replica is in an unknown state | From 4e0dc14d2d87dcc18975ac8df612f8a3710a9b42 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Tue, 27 Jul 2021 11:07:57 -0700 Subject: [PATCH 39/40] Update 
statuses.md --- docs/workloads/realtime/statuses.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/workloads/realtime/statuses.md b/docs/workloads/realtime/statuses.md index a6a12cf225..d4e201bfba 100644 --- a/docs/workloads/realtime/statuses.md +++ b/docs/workloads/realtime/statuses.md @@ -1,19 +1,18 @@ # Replica states -The replica states of an API can be inspected by running the `cortex describe ` command. When run, a table is presented that shows how many replicas of the said API are found in each of the following states: +The replica states of an API can be inspected by running `cortex describe `. Here are the possible states for each replica in an API: | State | Meaning | |:---|:---| | Ready | Replica is running and it has passed the readiness checks | | ReadyOutOfDate | Replica is running and it has passed the readiness checks (for an out-of-date replica) | | NotReady | Replica is running but it's not passing the readiness checks; make sure the server is listening on the designed port of the API | -| Requested | Requested number of replicas for a given API | | Pending | Replica is in a pending state (waiting to get scheduled onto a node) | | Creating | Replica is in the process of having its containers created | | ErrImagePull | Replica was not created because one of the specified Docker images was inaccessible at runtime; check that your API's docker images exist and are accessible via your cluster's AWS credentials | | Failed | Replica couldn't start due to an error; run `cortex logs ` to view the logs | -| Killed | Replica has had one of its containers' process(es) killed | +| Killed | Replica has had one of its containers killed | | KilledOOM | Replica was terminated due to excessive memory usage; try allocating more memory to the API and re-deploy | -| Stalled | Replica has been in a pending state for more than 15 minutes; causes like insufficient memory, CPU, GPU or Inf could be culprit; could also be that the node 
selector on the API is out-of-date | +| Stalled | Replica has been in a pending state for more than 15 minutes; see [troubleshooting](../realtime/troubleshooting.md) | | Terminating | Replica is currently in the process of being terminated | | Unknown | Replica is in an unknown state | From ce6a89f19be1603bda055b2cc5600848e7934d58 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 27 Jul 2021 22:06:16 +0300 Subject: [PATCH 40/40] Address PR comments --- cli/cmd/lib_traffic_splitters.go | 4 ++-- pkg/operator/resources/asyncapi/status.go | 2 +- pkg/operator/resources/realtimeapi/status.go | 2 +- pkg/operator/resources/trafficsplitter/api.go | 20 +++++++++---------- pkg/operator/schema/schema.go | 17 ++++++++-------- pkg/types/status/status.go | 2 +- pkg/types/userconfig/api.go | 4 ++-- pkg/types/userconfig/config_key.go | 2 +- 8 files changed, 26 insertions(+), 27 deletions(-) diff --git a/cli/cmd/lib_traffic_splitters.go b/cli/cmd/lib_traffic_splitters.go index 8eaf6b048b..af2b4e4aad 100644 --- a/cli/cmd/lib_traffic_splitters.go +++ b/cli/cmd/lib_traffic_splitters.go @@ -110,14 +110,14 @@ func trafficSplitTable(trafficSplitter schema.APIResponse, env cliconfig.Environ func trafficSplitterListTable(trafficSplitter []schema.APIResponse, envNames []string) table.Table { rows := make([][]interface{}, 0, len(trafficSplitter)) for i, splitAPI := range trafficSplitter { - if splitAPI.Metadata == nil || splitAPI.Status == nil { + if splitAPI.Metadata == nil || splitAPI.NumTrafficSplitterTargets == nil { continue } lastUpdated := time.Unix(splitAPI.Metadata.LastUpdated, 0) rows = append(rows, []interface{}{ envNames[i], splitAPI.Metadata.Name, - s.Int32(splitAPI.Status.Ready), + s.Int32(*splitAPI.NumTrafficSplitterTargets), libtime.SinceStr(&lastUpdated), }) } diff --git a/pkg/operator/resources/asyncapi/status.go b/pkg/operator/resources/asyncapi/status.go index 69977c731e..7035c31c01 100644 --- a/pkg/operator/resources/asyncapi/status.go +++ 
b/pkg/operator/resources/asyncapi/status.go @@ -73,7 +73,7 @@ func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts case k8s.PodStatusCreating: counts.Creating++ case k8s.PodStatusReady: - counts.Creating++ + counts.Ready++ case k8s.PodStatusNotReady: counts.NotReady++ case k8s.PodStatusErrImagePull: diff --git a/pkg/operator/resources/realtimeapi/status.go b/pkg/operator/resources/realtimeapi/status.go index 0a88b83d17..a90c42f387 100644 --- a/pkg/operator/resources/realtimeapi/status.go +++ b/pkg/operator/resources/realtimeapi/status.go @@ -72,7 +72,7 @@ func addPodToReplicaCounts(pod *kcore.Pod, deployment *kapps.Deployment, counts case k8s.PodStatusCreating: counts.Creating++ case k8s.PodStatusReady: - counts.Creating++ + counts.Ready++ case k8s.PodStatusNotReady: counts.NotReady++ case k8s.PodStatusErrImagePull: diff --git a/pkg/operator/resources/trafficsplitter/api.go b/pkg/operator/resources/trafficsplitter/api.go index 03c89ea4a1..4881f724e3 100644 --- a/pkg/operator/resources/trafficsplitter/api.go +++ b/pkg/operator/resources/trafficsplitter/api.go @@ -26,11 +26,11 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/lib/parallel" + "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/operator/lib/routines" "github.com/cortexlabs/cortex/pkg/operator/operator" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/spec" - "github.com/cortexlabs/cortex/pkg/types/status" "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/cortexlabs/cortex/pkg/workloads" istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" @@ -142,21 +142,19 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService) ([]schem return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } + if metadata.Kind != userconfig.TrafficSplitterKind { + continue + } + targets, err := 
userconfig.TrafficSplitterTargetsFromAnnotations(&virtualServices[i]) if err != nil { return nil, errors.Wrap(err, fmt.Sprintf("api %s", apiName)) } - if metadata.Kind == userconfig.TrafficSplitterKind { - trafficSplitters = append(trafficSplitters, schema.APIResponse{ - Metadata: metadata, - Status: &status.Status{ - Ready: targets, - Requested: targets, - UpToDate: targets, - }, - }) - } + trafficSplitters = append(trafficSplitters, schema.APIResponse{ + Metadata: metadata, + NumTrafficSplitterTargets: pointer.Int32(targets), + }) } return trafficSplitters, nil diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index 1127d3dbf8..1ee895cace 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -56,14 +56,15 @@ type DeployResult struct { } type APIResponse struct { - Spec *spec.API `json:"spec,omitempty" yaml:"spec,omitempty"` - Metadata *spec.Metadata `json:"metadata,omitempty" yaml:"metadata,omitempty"` - Status *status.Status `json:"status,omitempty" yaml:"status,omitempty"` - Endpoint *string `json:"endpoint,omitempty" yaml:"endpoint,omitempty"` - DashboardURL *string `json:"dashboard_url,omitempty" yaml:"dashboard_url,omitempty"` - BatchJobStatuses []status.BatchJobStatus `json:"batch_job_statuses,omitempty" yaml:"batch_job_statuses,omitempty"` - TaskJobStatuses []status.TaskJobStatus `json:"task_job_statuses,omitempty" yaml:"task_job_statuses,omitempty"` - APIVersions []APIVersion `json:"api_versions,omitempty" yaml:"api_versions,omitempty"` + Spec *spec.API `json:"spec,omitempty" yaml:"spec,omitempty"` + Metadata *spec.Metadata `json:"metadata,omitempty" yaml:"metadata,omitempty"` + Status *status.Status `json:"status,omitempty" yaml:"status,omitempty"` + NumTrafficSplitterTargets *int32 `json:"num_traffic_splitter_targets,omitempty" yaml:"num_traffic_splitter_targets,omitempty"` + Endpoint *string `json:"endpoint,omitempty" yaml:"endpoint,omitempty"` + DashboardURL *string `json:"dashboard_url,omitempty" 
yaml:"dashboard_url,omitempty"` + BatchJobStatuses []status.BatchJobStatus `json:"batch_job_statuses,omitempty" yaml:"batch_job_statuses,omitempty"` + TaskJobStatuses []status.TaskJobStatus `json:"task_job_statuses,omitempty" yaml:"task_job_statuses,omitempty"` + APIVersions []APIVersion `json:"api_versions,omitempty" yaml:"api_versions,omitempty"` } type LogResponse struct { diff --git a/pkg/types/status/status.go b/pkg/types/status/status.go index 15288fc8d1..e0de4943ef 100644 --- a/pkg/types/status/status.go +++ b/pkg/types/status/status.go @@ -61,7 +61,7 @@ type ReplicaCounts struct { Ready int32 `json:"ready" yaml:"ready"` ReadyOutOfDate int32 `json:"ready_out_of_date" yaml:"ready_out_of_date"` ErrImagePull int32 `json:"err_image_pull" yaml:"err_image_pull"` - Terminating int32 `json:"terminating" yaml:"terminating"` + Terminating int32 `json:"terminating" yaml:"terminating"` // includes up-to-date and out-of-date pods Failed int32 `json:"failed" yaml:"failed"` Killed int32 `json:"killed" yaml:"killed"` KilledOOM int32 `json:"killed_oom" yaml:"killed_oom"` diff --git a/pkg/types/userconfig/api.go b/pkg/types/userconfig/api.go index a90a29e952..c524c599e0 100644 --- a/pkg/types/userconfig/api.go +++ b/pkg/types/userconfig/api.go @@ -156,7 +156,7 @@ func (api *API) ToK8sAnnotations() map[string]string { annotations := map[string]string{} if len(api.APIs) > 0 { - annotations[NumberOfTrafficSplitterTargets] = s.Int32(int32(len(api.APIs))) + annotations[NumTrafficSplitterTargetsAnnotationKey] = s.Int32(int32(len(api.APIs))) } if api.Pod != nil && api.Kind == RealtimeAPIKind { @@ -250,7 +250,7 @@ func AutoscalingFromAnnotations(k8sObj kmeta.Object) (*Autoscaling, error) { } func TrafficSplitterTargetsFromAnnotations(k8sObj kmeta.Object) (int32, error) { - targets, err := k8s.ParseInt32Annotation(k8sObj, NumberOfTrafficSplitterTargets) + targets, err := k8s.ParseInt32Annotation(k8sObj, NumTrafficSplitterTargetsAnnotationKey) if err != nil { return 0, err } diff --git 
a/pkg/types/userconfig/config_key.go b/pkg/types/userconfig/config_key.go index 263f764bd6..5cbe3b2dda 100644 --- a/pkg/types/userconfig/config_key.go +++ b/pkg/types/userconfig/config_key.go @@ -91,7 +91,7 @@ const ( EndpointAnnotationKey = "networking.cortex.dev/endpoint" MaxConcurrencyAnnotationKey = "pod.cortex.dev/max-concurrency" MaxQueueLengthAnnotationKey = "pod.cortex.dev/max-queue-length" - NumberOfTrafficSplitterTargets = "apis.cortex.dev/traffic-splitter-targets" + NumTrafficSplitterTargetsAnnotationKey = "apis.cortex.dev/traffic-splitter-targets" MinReplicasAnnotationKey = "autoscaling.cortex.dev/min-replicas" MaxReplicasAnnotationKey = "autoscaling.cortex.dev/max-replicas" TargetInFlightAnnotationKey = "autoscaling.cortex.dev/target-in-flight"