From dd695c0b06de6bd32e6317f986802179249aa705 Mon Sep 17 00:00:00 2001 From: jbphyswx Date: Mon, 9 Dec 2024 02:12:01 -0800 Subject: [PATCH] Workers inherit environment by default, fix tests --- .gitignore | 31 +++++++++ src/slurmmanager.jl | 39 ++++++++++- test/runtests.jl | 20 +++++- test/runtests_helpers.jl | 35 ++++++++++ test/script.jl | 4 +- test/test_inheritance/Project.toml | 4 ++ test/test_inheritance/runtests_inheritance.jl | 46 +++++++++++++ test/test_inheritance/script_inheritance.jl | 66 +++++++++++++++++++ 8 files changed, 239 insertions(+), 6 deletions(-) create mode 100644 .gitignore create mode 100644 test/runtests_helpers.jl create mode 100644 test/test_inheritance/Project.toml create mode 100644 test/test_inheritance/runtests_inheritance.jl create mode 100644 test/test_inheritance/script_inheritance.jl diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0242a9b --- /dev/null +++ b/.gitignore @@ -0,0 +1,31 @@ +# Files generated by invoking Julia with --code-coverage +*.jl.cov +*.jl.*.cov + +# Files generated by invoking Julia with --track-allocation +*.jl.mem + +# System-specific files and directories generated by the BinaryProvider and BinDeps packages +# They contain absolute paths specific to the host computer, and so should not be committed +deps/deps.jl +deps/build.log +deps/downloads/ +deps/usr/ +deps/src/ + +# Build artifacts for creating documentation generated by the Documenter package +docs/build/ +docs/site/ + +# File generated by Pkg, the package manager, based on a corresponding Project.toml +# It records a fixed state of all packages used by the project. As such, it should not be +# committed for packages, but should be committed for applications that require a static +# environment. 
+Manifest.toml + +# editor settings +.vscode +LocalPreferences.toml + +# slurm logs +*.out \ No newline at end of file diff --git a/src/slurmmanager.jl b/src/slurmmanager.jl index a01ad4f..69c9712 100644 --- a/src/slurmmanager.jl +++ b/src/slurmmanager.jl @@ -43,21 +43,54 @@ mutable struct SlurmManager <: ClusterManager end end +""" + By default, workers inherit the environment variables from the master process [implementation adapted directly from https://github.com/JuliaLang/julia/pull/43270/files] +""" function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Condition) try + dir = params[:dir] exehome = params[:dir] exename = params[:exename] exeflags = params[:exeflags] + # TODO: Maybe this belongs in base/initdefs.jl as a package_environment() function + # together with load_path() etc. Might be useful to have when spawning julia + # processes outside of Distributed.jl too. + # JULIA_(LOAD|DEPOT)_PATH are used to populate (LOAD|DEPOT)_PATH on startup, + # but since (LOAD|DEPOT)_PATH might have changed they are re-serialized here. + # Users can opt-out of this by passing `env = ...` to addprocs(...). + env = Dict{String,String}(params[:env]) + pathsep = Sys.iswindows() ? ";" : ":" + if get(env, "JULIA_LOAD_PATH", nothing) === nothing + env["JULIA_LOAD_PATH"] = join(LOAD_PATH, pathsep) + end + if get(env, "JULIA_DEPOT_PATH", nothing) === nothing + env["JULIA_DEPOT_PATH"] = join(DEPOT_PATH, pathsep) + end + + # is this necessary? + if !params[:enable_threaded_blas] && + get(env, "OPENBLAS_NUM_THREADS", nothing) === nothing + env["OPENBLAS_NUM_THREADS"] = "1" + end + + # Set the active project on workers using JULIA_PROJECT. + # Users can opt-out of this by (i) passing `env = ...` or (ii) passing + # `--project=...` as `exeflags` to addprocs(...). 
+ project = Base.ACTIVE_PROJECT[] + if project !== nothing && get(env, "JULIA_PROJECT", nothing) === nothing + env["JULIA_PROJECT"] = project + end + # Pass cookie as stdin to srun; srun forwards stdin to process # This way the cookie won't be visible in ps, top, etc on the compute node srun_cmd = `srun -D $exehome $exename $exeflags --worker` - manager.srun_proc = open(srun_cmd, write=true, read=true) + manager.srun_proc = open(setenv(addenv(srun_cmd, env), dir=dir), write=true, read=true) write(manager.srun_proc, cluster_cookie()) write(manager.srun_proc, "\n") t = @async for i in 1:manager.ntasks - manager.verbose && println("connecting to worker $i out of $(manager.ntasks)") + manager.verbose && @info "connecting to worker $i out of $(manager.ntasks)" line = readline(manager.srun_proc) m = match(r".*:(\d*)#(.*)", line) @@ -67,6 +100,8 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Co config.port = parse(Int, m[1]) config.host = strip(m[2]) + manager.verbose && @info "Worker $i ready on host $(config.host), port $(config.port)" + push!(instances_arr, config) notify(c) end diff --git a/test/runtests.jl b/test/runtests.jl index f03d53d..5c2940f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,12 +5,25 @@ using Distributed, Test, SlurmClusterManager # test that slurm is available @test !(Sys.which("sinfo") === nothing) +# NOTE: If in an interactive allocation (not sure how to check and avoid), the sbatch command below will fail on recent slurm versions. See [https://support.schedmd.com/show_bug.cgi?id=14298] +include("runtests_helpers.jl") +job_id = "SLURM_JOB_ID" in keys(ENV) ? 
ENV["SLURM_JOB_ID"] : nothing +if !isnothing(job_id) && is_interactive(job_id) + slurm_version = get_slurm_version(;fail_on_error = false, verbose = true) + if slurm_version >= (22, 5, 0) + @warn("Slurm_version = $(join(slurm_version, ".")) | Modern Slurm (≥22.05.0) does not allow running sbatch jobs that contain srun, mpirun, mpiexec commands if those jobs are submitted from within an interactive srun job. Run from a non-interactive session. Skipping tests...") + # @test false # force test to fail [leave off by default in case this is part of a larger test suite/CI pipeline] + exit(0) + end +end + # submit job # project should point to top level dir so that SlurmClusterManager is available to script.jl project_path = abspath(joinpath(@__DIR__, "..")) println("project_path = $project_path") +# Test workers with JULIA_PROJECT set in env jobid = withenv("JULIA_PROJECT"=>project_path) do - read(`sbatch --export=ALL --parsable -n 4 -o test.out script.jl`, String) + read(`sbatch --export=ALL --time=0:02:00 --parsable -n 2 -N 2 -o test.out script.jl`, String) # test a minimal config. end println("jobid = $jobid") @@ -36,7 +49,10 @@ println(output) state = getjobstate(jobid) |> split # length should be two because creating the workers creates a job step -@test length(state) == 2 +@test length(state) == 4 # job_id (script.jl), job_id.ba+ (batch), job_id.ex+ (extern), job_id.0 (julia) # check that everything exited without errors @test all(state .== "COMPLETED") + +# Test workers automatically inheriting environment +include("test_inheritance/runtests_inheritance.jl") # Test workers properly inherit environment by default \ No newline at end of file diff --git a/test/runtests_helpers.jl b/test/runtests_helpers.jl new file mode 100644 index 0000000..ac80e3b --- /dev/null +++ b/test/runtests_helpers.jl @@ -0,0 +1,35 @@ +function is_interactive(job_id) + output = read(`scontrol show job $job_id`, String) + return occursin("BatchFlag=0", output) ? 
true : false +end + + +function get_slurm_version(; fail_on_error::Bool = true, verbose::Bool = false) + # Run the srun --version command and capture the output + try + output = read(`srun --version`, String) + + # Extract the version number using a regular expression + version_match = match(r"\b(\d+)\.(\d+)\.(\d+)\b", output) + if version_match === nothing + error("Could not extract SLURM version from: $output") + end + + # Parse the version numbers + major, minor, patch = parse.(Int, version_match.captures) + + return (major, minor, patch) + + catch e + if fail_on_error + error("Failed to determine SLURM version: $e") + else + if verbose + @error("Failed to determine SLURM version: $e") + end + return nothing + end + end +end + + diff --git a/test/script.jl b/test/script.jl index 060760a..8d37c84 100644 --- a/test/script.jl +++ b/test/script.jl @@ -1,7 +1,7 @@ #!/usr/bin/env julia using Distributed, SlurmClusterManager -addprocs(SlurmManager()) +addprocs(SlurmManager(verbose=true)) @assert nworkers() == parse(Int, ENV["SLURM_NTASKS"]) @@ -10,4 +10,4 @@ hosts = map(workers()) do id end sort!(hosts) -@assert hosts == ["c1", "c1", "c2", "c2"] +@everywhere println("Host $(myid()): $(gethostname())") # workers report in diff --git a/test/test_inheritance/Project.toml b/test/test_inheritance/Project.toml new file mode 100644 index 0000000..f778215 --- /dev/null +++ b/test/test_inheritance/Project.toml @@ -0,0 +1,4 @@ +[deps] +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +SlurmClusterManager = "c82cd089-7bf7-41d7-976b-6b5d413cbe0a" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/test_inheritance/runtests_inheritance.jl b/test/test_inheritance/runtests_inheritance.jl new file mode 100644 index 0000000..4fe2468 --- /dev/null +++ b/test/test_inheritance/runtests_inheritance.jl @@ -0,0 +1,46 @@ +#!/usr/bin/env julia + +using Distributed, Test, SlurmClusterManager + +# test that slurm is available +@test !(Sys.which("sinfo") === nothing) + 
+SlurmClusterManager_dir = pkgdir(SlurmClusterManager) +script_file = joinpath(SlurmClusterManager_dir, "test", "test_inheritance", "script_inheritance.jl") + +# submit job +# project should point to top level dir so that SlurmClusterManager is available to script.jl +project_path = @__DIR__ +println("project_path = $project_path") +log_file = joinpath(@__DIR__, "test_inheritance.out") +jobid = read(`sbatch --export=ALL --time=0:02:00 --parsable -n 2 -N 2 -o $log_file --wrap="julia --project=$project_path $script_file"`, String) # test a minimal config in a different location without env settings + +println("jobid = $jobid") + +# get job state from jobid +getjobstate = jobid -> read(`sacct -j $jobid --format=state --noheader`, String) + +# wait for job to complete +status = timedwait(60.0, pollint=1.0) do + state = getjobstate(jobid) + state == "" && return false + state = first(split(state)) # don't care about jobsteps + println("jobstate = $state") + return state == "COMPLETED" || state == "FAILED" +end + +# check that job finished running within timelimit (either completed or failed) +@test status == :ok + +# print job output +output = read(log_file, String) +println("script output:") +println(output) + +state = getjobstate(jobid) |> split +println("job state = $state") +# length should be four: the job itself plus the batch, extern, and worker job steps +@test length(state) == 4 # job_id (script.jl), job_id.ba+ (batch), job_id.ex+ (extern), job_id.0 (julia) + +# check that everything exited without errors +@test all(state .== "COMPLETED") diff --git a/test/test_inheritance/script_inheritance.jl b/test/test_inheritance/script_inheritance.jl new file mode 100644 index 0000000..b84dc98 --- /dev/null +++ b/test/test_inheritance/script_inheritance.jl @@ -0,0 +1,66 @@ +#!/usr/bin/env julia + +using Distributed, SlurmClusterManager +addprocs(SlurmManager(verbose=true)) + +@assert nworkers() == parse(Int, ENV["SLURM_NTASKS"]) + +hosts = map(workers()) do id + remotecall_fetch(() -> 
gethostname(), id) +end +sort!(hosts) + +@everywhere println("Host $(myid()): $(gethostname())") # I created a separate test for ensuring that the env is set properly but without withenv it's hard to test -- you need another location that has Distributed installed + +pathsep = Sys.iswindows() ? ";" : ":" +active_project = Base.ACTIVE_PROJECT[] # project is set in runtests_inheritance so it will have a value [returns path unlike `Base.active_project()`] + +# Check the env variables on this worker (they may have been set by user configs) +julia_depot_path_orig = ("JULIA_DEPOT_PATH" in keys(ENV)) ? ENV["JULIA_DEPOT_PATH"] : nothing +julia_load_path_orig = ("JULIA_LOAD_PATH" in keys(ENV)) ? ENV["JULIA_LOAD_PATH"] : nothing +julia_project_orig = ("JULIA_PROJECT" in keys(ENV)) ? ENV["JULIA_PROJECT"] : nothing + +# Recreate the values that are set in /src/slurmmanager.jl launch() function for a defined project but nothing in env argument to addprocs +julia_depot_path_new = join(DEPOT_PATH, pathsep) +julia_load_path_new = join(LOAD_PATH, pathsep) +julia_project_new = active_project + +@info "Active Project: $active_project" + + +# Test we get what we expect. The original worker may have env variables unset, while the new workers have them set. +# We can't guarantee which is the original worker, so we check for both cases +@everywhere begin + _julia_depot_path = ("JULIA_DEPOT_PATH" in keys(ENV)) ? ENV["JULIA_DEPOT_PATH"] : nothing + _julia_load_path = ("JULIA_LOAD_PATH" in keys(ENV)) ? ENV["JULIA_LOAD_PATH"] : nothing + _julia_project = ("JULIA_PROJECT" in keys(ENV)) ? 
ENV["JULIA_PROJECT"] : nothing + _active_project = Base.ACTIVE_PROJECT[] + + if !(_julia_depot_path == $julia_depot_path_orig) && !(_julia_depot_path == $julia_depot_path_new) # check for both cases + julia_depot_path_orig_interp = $julia_depot_path_orig # hack to get the variable because i can't figure out how to do the interpolation inside a string inside an @everywhere block + julia_depot_path_new_interp = $julia_depot_path_new # hack to get the variable because i can't figure out how to do the interpolation inside a string inside an @everywhere block + error("Expected ENV[\"JULIA_DEPOT_PATH\"] to match $julia_depot_path_orig_interp or $julia_depot_path_new_interp, but got $_julia_depot_path") + end + + if !(_julia_load_path == $julia_load_path_orig) && !(_julia_load_path == $julia_load_path_new) # check for both cases + julia_load_path_orig_interp = $julia_load_path_orig # hack to get the variable because i can't figure out how to do the interpolation inside a string inside an @everywhere block + julia_load_path_new_interp = $julia_load_path_new # hack to get the variable because i can't figure out how to do the interpolation inside a string inside an @everywhere block + error("Expected ENV[\"JULIA_LOAD_PATH\"] to match $julia_load_path_orig_interp or $julia_load_path_new_interp, but got $_julia_load_path") + end + + # Check the env variable for JULIA_PROJECT is set + if !(_julia_project == $julia_project_orig) && !(_julia_project == $julia_project_new) # check for both cases + julia_project_orig_interp = $julia_project_orig # hack to get the variable because i can't figure out how to do the interpolation inside a string inside an @everywhere block + julia_project_new_interp = $julia_project_new # hack to get the variable because i can't figure out how to do the interpolation inside a string inside an @everywhere block + error("Expected ENV[\"JULIA_PROJECT\"] to match $julia_project_orig_interp or $julia_project_new_interp, but got $_julia_project") + end + + # 
Check the active project is correctly set to what the env variable was set to + if !(_active_project == $active_project) # This should be the same on all workers + active_project_interp = $active_project # hack to get the variable because i can't figure out how to do the interpolation inside a string inside an @everywhere block + active_toml = Base.active_project() + error("Expected Base.ACTIVE_PROJECT[] to match $active_project_interp, but got $_active_project. [ Active toml from Base.active_project() = $active_toml ]") + end + + @info "Host $(myid()): $(gethostname())\nJULIA_DEPOT_PATH: $_julia_depot_path\nJULIA_LOAD_PATH: $_julia_load_path\nJULIA_PROJECT: $_julia_project\nactive_project: $_active_project\n" +end