From 332739532f32e34105bc30a6b4652936e6ad1242 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Wed, 15 Jan 2025 19:15:49 -0800 Subject: [PATCH 1/7] switch to updated ghcr image --- ci/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 94750e7..963494e 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -1,7 +1,7 @@ -FROM jkleinh/slurm-cluster@sha256:afd20dafc831b0fa781460dc871232579ccf1b54955e434531394c331ce388e4 as base +FROM ghcr.io/kleinhenz/docker-slurm-cluster@sha256:c5569678dbe8cc94db60d1e2f520a0ed9c19ea1d4085f761b4b4a447d3bccca1 as base MAINTAINER Joseph Kleinhenz -ARG JULIA_VERSION=1.6.0 +ARG JULIA_VERSION=1.11.2 RUN mkdir -p /home/docker/.local/opt/julia \ && cd /home/docker/.local/opt/julia \ From 9966ed9cee4c12dedb4c054262321113db758ffd Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Wed, 15 Jan 2025 19:35:48 -0800 Subject: [PATCH 2/7] update image --- ci/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 963494e..3e19317 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -1,4 +1,4 @@ -FROM ghcr.io/kleinhenz/docker-slurm-cluster@sha256:c5569678dbe8cc94db60d1e2f520a0ed9c19ea1d4085f761b4b4a447d3bccca1 as base +FROM ghcr.io/kleinhenz/docker-slurm-cluster@sha256:ed26ab967e84c955f75499d50ef7a4f844e48d7812de5ef6b64423728d9d10c2 as base MAINTAINER Joseph Kleinhenz ARG JULIA_VERSION=1.11.2 From 004ce7a1f20285ebb024b1c48873050ef0a66ffb Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Wed, 15 Jan 2025 20:32:56 -0800 Subject: [PATCH 3/7] use scontrol instead of sacct --- test/runtests.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index f03d53d..636cdcf 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,13 +15,16 @@ end println("jobid = $jobid") # get job state from jobid -getjobstate = jobid -> read(`sacct -j $jobid --format=state --noheader`, String) +getjobstate = jobid -> begin + info = read(`scontrol show $jobid`, String) + state = match(r"JobState=(\S*)", info) + return state +end # wait for job to complete status = timedwait(60.0, pollint=1.0) do state = getjobstate(jobid) - state == "" && return false - state = first(split(state)) # don't care about jobsteps + state == nothing && return false println("jobstate = $state") return state == "COMPLETED" || state == "FAILED" end From 37229c2b8eb310dd3316f928d98e1f6945fbb9a3 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Wed, 15 Jan 2025 20:35:28 -0800 Subject: [PATCH 4/7] fix scontrol command --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 636cdcf..507a578 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -16,7 +16,7 @@ println("jobid = $jobid") # get job state from jobid getjobstate = jobid -> begin - info = read(`scontrol show $jobid`, String) + info = read(`scontrol show jobid=$jobid`, String) state = match(r"JobState=(\S*)", info) return state end From 715f683b18f532a952ae1516f66d314b52357780 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Wed, 15 Jan 2025 20:40:00 -0800 Subject: [PATCH 5/7] handle non zero exit status --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 507a578..817a8f0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -16,7 +16,7 @@ println("jobid = $jobid") # get job state from jobid getjobstate = jobid -> begin - info = read(`scontrol show jobid=$jobid`, String) + info = read(Cmd(`scontrol show jobid=$jobid`, ignorestatus=true), String) state = match(r"JobState=(\S*)", info) return state end From 08d6b9f5992f7af955634243ae96a400855327d1 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Wed, 15 Jan 2025 21:33:28 -0800 Subject: [PATCH 6/7] fixes --- ci/Dockerfile | 3 +-- ci/docker-compose.yml | 5 +++-- test/runtests.jl | 17 +++++++---------- test/script.jl | 1 + 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 3e19317..25a3586 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -1,5 +1,4 @@ -FROM ghcr.io/kleinhenz/docker-slurm-cluster@sha256:ed26ab967e84c955f75499d50ef7a4f844e48d7812de5ef6b64423728d9d10c2 as base -MAINTAINER Joseph Kleinhenz +FROM ghcr.io/kleinhenz/docker-slurm-cluster@sha256:c62b169970eaab879898a7df5950f888b0417147e06f8c0e32fd15087b22d9f2 ARG JULIA_VERSION=1.11.2 diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml index e9b60e5..bb06d41 100644 --- a/ci/docker-compose.yml +++ b/ci/docker-compose.yml @@ -1,8 +1,9 @@ -version: "3.3" - services: slurmctld: image: slurm-cluster-julia + build: + context: .. + dockerfile: ci/Dockerfile command: ["slurmctld"] container_name: slurmctld hostname: slurmctld diff --git a/test/runtests.jl b/test/runtests.jl index 817a8f0..0a8ee6b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,15 +10,16 @@ using Distributed, Test, SlurmClusterManager project_path = abspath(joinpath(@__DIR__, "..")) println("project_path = $project_path") jobid = withenv("JULIA_PROJECT"=>project_path) do - read(`sbatch --export=ALL --parsable -n 4 -o test.out script.jl`, String) + strip(read(`sbatch --export=ALL --parsable -n 4 -o test.out script.jl`, String)) end println("jobid = $jobid") # get job state from jobid getjobstate = jobid -> begin - info = read(Cmd(`scontrol show jobid=$jobid`, ignorestatus=true), String) + cmd = Cmd(`scontrol show jobid=$jobid`, ignorestatus=true) + info = read(cmd, String) state = match(r"JobState=(\S*)", info) - return state + return isnothing(state) ? nothing : state.captures[1] end # wait for job to complete @@ -29,17 +30,13 @@ status = timedwait(60.0, pollint=1.0) do return state == "COMPLETED" || state == "FAILED" end +state = getjobstate(jobid) + # check that job finished running within timelimit (either completed or failed) @test status == :ok +@test state == "COMPLETED" # print job output output = read("test.out", String) println("script output:") println(output) - -state = getjobstate(jobid) |> split -# length should be two because creating the workers creates a job step -@test length(state) == 2 - -# check that everything exited without errors -@test all(state .== "COMPLETED") diff --git a/test/script.jl b/test/script.jl index 060760a..1da6c49 100644 --- a/test/script.jl +++ b/test/script.jl @@ -9,5 +9,6 @@ hosts = map(workers()) do id remotecall_fetch(() -> gethostname(), id) end sort!(hosts) +println(hosts) @assert hosts == ["c1", "c1", "c2", "c2"] From 65e006a850daec456a6131ba1ac4ca76842d3d59 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Wed, 15 Jan 2025 21:54:17 -0800 Subject: [PATCH 7/7] don't use isnothing --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 0a8ee6b..307f1fe 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -19,7 +19,7 @@ getjobstate = jobid -> begin cmd = Cmd(`scontrol show jobid=$jobid`, ignorestatus=true) info = read(cmd, String) state = match(r"JobState=(\S*)", info) - return isnothing(state) ? nothing : state.captures[1] + return state === nothing ? nothing : state.captures[1] end # wait for job to complete