Skip to content

Avoid preemption of failed co-scheduled pods #317

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 7, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 36 additions & 32 deletions pkg/controller/queuejobresources/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ func PendingPodsFailedSchd(pods []*v1.Pod) map[string][]v1.PodCondition {
if strings.Contains(cond.Message, "pgName") && strings.Contains(cond.Message, "last") && strings.Contains(cond.Message, "failed") && strings.Contains(cond.Message, "deny") {
//ignore co-scheduled pending pods for coscheduler version:0.22.6
continue
} else if strings.Contains(cond.Message, "optimistic") && strings.Contains(cond.Message, "rejection") && strings.Contains(cond.Message, "PostFilter") ||
strings.Contains(cond.Message, "cannot") && strings.Contains(cond.Message, "find") && strings.Contains(cond.Message, "enough") && strings.Contains(cond.Message, "sibling") {
//ignore co-scheduled pending pods for coscheduler version:0.23.10
continue
} else {
podName := string(pods[i].Name)
podCondition[podName] = append(podCondition[podName], *cond.DeepCopy())
Expand All @@ -74,41 +78,41 @@ func PendingPodsFailedSchd(pods []*v1.Pod) map[string][]v1.PodCondition {

// filterPods returns pods based on their phase.
func GetPodResourcesByPhase(phase v1.PodPhase, pods []*v1.Pod) *clusterstateapi.Resource {
req := clusterstateapi.EmptyResource()
for i := range pods {
if pods[i].Status.Phase == phase {
for _, c := range pods[i].Spec.Containers {
req.Add(clusterstateapi.NewResource(c.Resources.Requests))
}
}
}
return req
req := clusterstateapi.EmptyResource()
for i := range pods {
if pods[i].Status.Phase == phase {
for _, c := range pods[i].Spec.Containers {
req.Add(clusterstateapi.NewResource(c.Resources.Requests))
}
}
}
return req
}

func GetPodResources(template *v1.PodTemplateSpec) *clusterstateapi.Resource {
total := clusterstateapi.EmptyResource()
req := clusterstateapi.EmptyResource()
limit := clusterstateapi.EmptyResource()
spec := template.Spec
total := clusterstateapi.EmptyResource()
req := clusterstateapi.EmptyResource()
limit := clusterstateapi.EmptyResource()
spec := template.Spec

if &spec == nil {
klog.Errorf("Pod Spec not found in Pod Template: %+v. Aggregated resources set to 0.", template)
return total
}
if &spec == nil {
klog.Errorf("Pod Spec not found in Pod Template: %+v. Aggregated resources set to 0.", template)
return total
}

for _, c := range template.Spec.Containers {
req.Add(clusterstateapi.NewResource(c.Resources.Requests))
limit.Add(clusterstateapi.NewResource(c.Resources.Limits))
}
if req.MilliCPU < limit.MilliCPU {
req.MilliCPU = limit.MilliCPU
}
if req.Memory < limit.Memory {
req.Memory = limit.Memory
}
if req.GPU < limit.GPU {
req.GPU = limit.GPU
}
total = total.Add(req)
return total
for _, c := range template.Spec.Containers {
req.Add(clusterstateapi.NewResource(c.Resources.Requests))
limit.Add(clusterstateapi.NewResource(c.Resources.Limits))
}
if req.MilliCPU < limit.MilliCPU {
req.MilliCPU = limit.MilliCPU
}
if req.Memory < limit.Memory {
req.Memory = limit.Memory
}
if req.GPU < limit.GPU {
req.GPU = limit.GPU
}
total = total.Add(req)
return total
}