Make workers inherit package environment by default, fix tests #17

Closed · wants to merge 1 commit
31 changes: 31 additions & 0 deletions .gitignore
@@ -0,0 +1,31 @@
# Files generated by invoking Julia with --code-coverage
*.jl.cov
*.jl.*.cov

# Files generated by invoking Julia with --track-allocation
*.jl.mem

# System-specific files and directories generated by the BinaryProvider and BinDeps packages
# They contain absolute paths specific to the host computer, and so should not be committed
deps/deps.jl
deps/build.log
deps/downloads/
deps/usr/
deps/src/

# Build artifacts for creating documentation generated by the Documenter package
docs/build/
docs/site/

# File generated by Pkg, the package manager, based on a corresponding Project.toml
# It records a fixed state of all packages used by the project. As such, it should not be
# committed for packages, but should be committed for applications that require a static
# environment.
Manifest.toml

# editor settings
.vscode
LocalPreferences.toml

# slurm logs
*.out
39 changes: 37 additions & 2 deletions src/slurmmanager.jl
@@ -43,21 +43,54 @@ mutable struct SlurmManager <: ClusterManager
end
end

"""
By default, workers inherit the environment variables of the master process (implementation adapted from https://github.com/JuliaLang/julia/pull/43270/files).
"""
function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Condition)
try
dir = params[:dir]
exehome = params[:dir]
exename = params[:exename]
exeflags = params[:exeflags]

# TODO: Maybe this belongs in base/initdefs.jl as a package_environment() function
# together with load_path() etc. Might be useful to have when spawning julia
# processes outside of Distributed.jl too.
# JULIA_(LOAD|DEPOT)_PATH are used to populate (LOAD|DEPOT)_PATH on startup,
# but since (LOAD|DEPOT)_PATH might have changed they are re-serialized here.
# Users can opt-out of this by passing `env = ...` to addprocs(...).
env = Dict{String,String}(params[:env])
pathsep = Sys.iswindows() ? ";" : ":"
if get(env, "JULIA_LOAD_PATH", nothing) === nothing
env["JULIA_LOAD_PATH"] = join(LOAD_PATH, pathsep)
end
if get(env, "JULIA_DEPOT_PATH", nothing) === nothing
env["JULIA_DEPOT_PATH"] = join(DEPOT_PATH, pathsep)
end

# is this necessary?
if !params[:enable_threaded_blas] &&
get(env, "OPENBLAS_NUM_THREADS", nothing) === nothing
env["OPENBLAS_NUM_THREADS"] = "1"
end

# Set the active project on workers using JULIA_PROJECT.
# Users can opt-out of this by (i) passing `env = ...` or (ii) passing
# `--project=...` as `exeflags` to addprocs(...).
project = Base.ACTIVE_PROJECT[]
if project !== nothing && get(env, "JULIA_PROJECT", nothing) === nothing
env["JULIA_PROJECT"] = project
end

# Pass cookie as stdin to srun; srun forwards stdin to process
# This way the cookie won't be visible in ps, top, etc on the compute node
srun_cmd = `srun -D $exehome $exename $exeflags --worker`
-manager.srun_proc = open(srun_cmd, write=true, read=true)
+manager.srun_proc = open(setenv(addenv(srun_cmd, env), dir=dir), write=true, read=true)
write(manager.srun_proc, cluster_cookie())
write(manager.srun_proc, "\n")

t = @async for i in 1:manager.ntasks
-manager.verbose && println("connecting to worker $i out of $(manager.ntasks)")
+manager.verbose && @info "connecting to worker $i out of $(manager.ntasks)"

line = readline(manager.srun_proc)
m = match(r".*:(\d*)#(.*)", line)
@@ -67,6 +100,8 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Condition)
config.port = parse(Int, m[1])
config.host = strip(m[2])

manager.verbose && @info "Worker $i ready on host $(config.host), port $(config.port)"

push!(instances_arr, config)
notify(c)
end
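As the comments in `launch` note, the serialized `JULIA_LOAD_PATH`, `JULIA_DEPOT_PATH`, and `JULIA_PROJECT` are only filled in when absent from `env`, so callers can opt out. A minimal sketch of that opt-out, assuming `env` pairs reach `params[:env]` as in Distributed's `addprocs`; the paths are hypothetical:

```julia
using Distributed, SlurmClusterManager

# Override the inherited variables explicitly; launch() only fills in
# values that are missing from this list.
addprocs(SlurmManager();
         env = ["JULIA_PROJECT" => "/path/to/other/project",  # hypothetical path
                "JULIA_DEPOT_PATH" => "/scratch/depot"])      # hypothetical path
```

Passing `--project=...` via `exeflags` achieves the same effect for the active project, per the comment above.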
20 changes: 18 additions & 2 deletions test/runtests.jl
@@ -5,12 +5,25 @@ using Distributed, Test, SlurmClusterManager
# test that slurm is available
@test !(Sys.which("sinfo") === nothing)

# NOTE: On recent Slurm versions, the sbatch command below fails when submitted from within an interactive allocation; see https://support.schedmd.com/show_bug.cgi?id=14298. The check below detects this case and skips the tests.
include("runtests_helpers.jl")
job_id = "SLURM_JOB_ID" in keys(ENV) ? ENV["SLURM_JOB_ID"] : nothing
if !isnothing(job_id) && is_interactive(job_id)
slurm_version = get_slurm_version(;fail_on_error = false, verbose = true)
if slurm_version >= (22, 5, 0)
@warn("Slurm_version = $(join(slurm_version, ".")) | Modern Slurm (≥22.05.0) does not allow running sbatch jobs that contain srun, mpirun, mpiexec commands if those jobs are submitted from within an interactive srun job. Run from a non-interactive session. Skipping tests...")
# @test false # force test to fail [leave off by default in case this is part of a larger test suite/CI pipeline]
exit(0)
end
end

# submit job
# project should point to top level dir so that SlurmClusterManager is available to script.jl
project_path = abspath(joinpath(@__DIR__, ".."))
println("project_path = $project_path")
# Test workers with JULIA_PROJECT set in env
jobid = withenv("JULIA_PROJECT"=>project_path) do
-read(`sbatch --export=ALL --parsable -n 4 -o test.out script.jl`, String)
+read(`sbatch --export=ALL --time=0:02:00 --parsable -n 2 -N 2 -o test.out script.jl`, String) # test a minimal config.
end
println("jobid = $jobid")

@@ -36,7 +49,10 @@ println(output)

state = getjobstate(jobid) |> split
# length should be four: the job itself plus the batch, extern, and julia job steps
-@test length(state) == 2
+@test length(state) == 4 # job_id (script.jl), job_id.ba+ (batch), job_id.ex+ (extern), job_id.0 (julia)

# check that everything exited without errors
@test all(state .== "COMPLETED")
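For context on the step count asserted above, `sacct --format=state --noheader` prints one line per job step; a minimal sketch of the expected shape for a clean run (the output string is hypothetical):

```julia
# One state line per step: the job itself, .batch, .extern, and the srun'd julia step.
sample = "COMPLETED\nCOMPLETED\nCOMPLETED\nCOMPLETED\n"  # hypothetical sacct output
@assert length(split(sample)) == 4 && all(==("COMPLETED"), split(sample))
```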

# Test that workers automatically inherit the environment by default
include("test_inheritance/runtests_inheritance.jl")
35 changes: 35 additions & 0 deletions test/runtests_helpers.jl
@@ -0,0 +1,35 @@
function is_interactive(job_id)
output = read(`scontrol show job $job_id`, String)
return occursin("BatchFlag=0", output)
end


function get_slurm_version(; fail_on_error::Bool = true, verbose::Bool = false)
# Run the srun --version command and capture the output
try
output = read(`srun --version`, String)

# Extract the version number using a regular expression
version_match = match(r"\b(\d+)\.(\d+)\.(\d+)\b", output)
if version_match === nothing
error("Could not extract SLURM version from: $output")
end

# Parse the version numbers
major, minor, patch = parse.(Int, version_match.captures)

return (major, minor, patch)

catch e
if fail_on_error
error("Failed to determine SLURM version: $e")
else
if verbose
@error("Failed to determine SLURM version: $e")
end
return nothing
end
end
end
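A sketch of how these helpers combine, mirroring the check in `runtests.jl` (names as defined above; tuples compare lexicographically in Julia, so the version comparison just works):

```julia
# Hypothetical usage of the helpers above:
job_id = get(ENV, "SLURM_JOB_ID", nothing)
if job_id !== nothing && is_interactive(job_id)
    v = get_slurm_version(fail_on_error = false, verbose = true)
    # (22, 5, 8) >= (22, 5, 0) compares element-wise, left to right.
    if v !== nothing && v >= (22, 5, 0)
        @warn "Interactive allocation on Slurm $(join(v, '.')); skipping tests"
    end
end
```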


4 changes: 2 additions & 2 deletions test/script.jl
@@ -1,7 +1,7 @@
#!/usr/bin/env julia

using Distributed, SlurmClusterManager
-addprocs(SlurmManager())
+addprocs(SlurmManager(verbose=true))

@assert nworkers() == parse(Int, ENV["SLURM_NTASKS"])

@@ -10,4 +10,4 @@ hosts = map(workers()) do id
end
sort!(hosts)

-@assert hosts == ["c1", "c1", "c2", "c2"]
+@everywhere println("Host $(myid()): $(gethostname())") # workers report in
4 changes: 4 additions & 0 deletions test/test_inheritance/Project.toml
@@ -0,0 +1,4 @@
[deps]
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
SlurmClusterManager = "c82cd089-7bf7-41d7-976b-6b5d413cbe0a"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
46 changes: 46 additions & 0 deletions test/test_inheritance/runtests_inheritance.jl
@@ -0,0 +1,46 @@
#!/usr/bin/env julia

using Distributed, Test, SlurmClusterManager

# test that slurm is available
@test !(Sys.which("sinfo") === nothing)

SlurmClusterManager_dir = pkgdir(SlurmClusterManager)
script_file = joinpath(SlurmClusterManager_dir, "test", "test_inheritance", "script_inheritance.jl")

# submit job
# project should point to the test_inheritance dir so that SlurmClusterManager is available to script_inheritance.jl
project_path = @__DIR__
println("project_path = $project_path")
log_file = joinpath(@__DIR__, "test_inheritance.out")
jobid = read(`sbatch --export=ALL --time=0:02:00 --parsable -n 2 -N 2 -o $log_file --wrap="julia --project=$project_path $script_file"`, String) # test a minimal config in a different location without env settings

println("jobid = $jobid")

# get job state from jobid
getjobstate = jobid -> read(`sacct -j $jobid --format=state --noheader`, String)

# wait for job to complete
status = timedwait(60.0, pollint=1.0) do
state = getjobstate(jobid)
state == "" && return false
state = first(split(state)) # don't care about jobsteps
println("jobstate = $state")
return state == "COMPLETED" || state == "FAILED"
end

# check that job finished running within timelimit (either completed or failed)
@test status == :ok

# print job output
output = read(log_file, String)
println("script output:")
println(output)

state = getjobstate(jobid) |> split
println("job state = $state")
# length should be four: the job itself plus the batch, extern, and julia job steps
@test length(state) == 4 # job_id (script_inheritance.jl), job_id.ba+ (batch), job_id.ex+ (extern), job_id.0 (julia)

# check that everything exited without errors
@test all(state .== "COMPLETED")
66 changes: 66 additions & 0 deletions test/test_inheritance/script_inheritance.jl
@@ -0,0 +1,66 @@
#!/usr/bin/env julia

using Distributed, SlurmClusterManager
addprocs(SlurmManager(verbose=true))

@assert nworkers() == parse(Int, ENV["SLURM_NTASKS"])

hosts = map(workers()) do id
remotecall_fetch(() -> gethostname(), id)
end
sort!(hosts)

@everywhere println("Host $(myid()): $(gethostname())") # I created a separate test for assuring that the env is set properly but without withenv it's hard to test -- you need another locaiton that has Distributed installed

pathsep = Sys.iswindows() ? ";" : ":"
active_project = Base.ACTIVE_PROJECT[] # set in runtests_inheritance, so it has a value; returns the raw project path, unlike `Base.active_project()`, which returns the Project.toml file

# Check the env variables on this (master) process; they may have been set by user configs
julia_depot_path_orig = get(ENV, "JULIA_DEPOT_PATH", nothing)
julia_load_path_orig = get(ENV, "JULIA_LOAD_PATH", nothing)
julia_project_orig = get(ENV, "JULIA_PROJECT", nothing)

# Recreate the values that are set in /src/slurmmanager.jl launch() function for a defined project but nothing in env argument to addprocs
julia_depot_path_new = join(DEPOT_PATH, pathsep)
julia_load_path_new = join(LOAD_PATH, pathsep)
julia_project_new = active_project

@info "Active Project: $active_project"


# Test that we get what we expect. The master process may have these env variables unset,
# while the new workers have them set. @everywhere runs on every process, so accept either value.
@everywhere begin
_julia_depot_path = get(ENV, "JULIA_DEPOT_PATH", nothing)
_julia_load_path = get(ENV, "JULIA_LOAD_PATH", nothing)
_julia_project = get(ENV, "JULIA_PROJECT", nothing)
_active_project = Base.ACTIVE_PROJECT[]

if !(_julia_depot_path == $julia_depot_path_orig) && !(_julia_depot_path == $julia_depot_path_new) # check for both cases
julia_depot_path_orig_interp = $julia_depot_path_orig # interpolate into locals for the error string (see the note after this file)
julia_depot_path_new_interp = $julia_depot_path_new
error("Expected ENV[\"JULIA_DEPOT_PATH\"] to match $julia_depot_path_orig_interp or $julia_depot_path_new_interp, but got $_julia_depot_path")
end

if !(_julia_load_path == $julia_load_path_orig) && !(_julia_load_path == $julia_load_path_new) # check for both cases
julia_load_path_orig_interp = $julia_load_path_orig # interpolate into locals for the error string
julia_load_path_new_interp = $julia_load_path_new
error("Expected ENV[\"JULIA_LOAD_PATH\"] to match $julia_load_path_orig_interp or $julia_load_path_new_interp, but got $_julia_load_path")
end

# Check that the env variable JULIA_PROJECT is set as expected
if !(_julia_project == $julia_project_orig) && !(_julia_project == $julia_project_new) # check for both cases
julia_project_orig_interp = $julia_project_orig # interpolate into locals for the error string
julia_project_new_interp = $julia_project_new
error("Expected ENV[\"JULIA_PROJECT\"] to match $julia_project_orig_interp or $julia_project_new_interp, but got $_julia_project")
end

# Check that the active project matches what the env variable was set to
if !(_active_project == $active_project) # this should be the same on all workers
active_project_interp = $active_project # interpolate into a local for the error string
active_toml = Base.active_project()
error("Expected Base.ACTIVE_PROJECT[] to match $active_project_interp, but got $_active_project. [Active toml from Base.active_project() = $active_toml]")
end

@info "Host $(myid()): $(gethostname())\nJULIA_DEPOT_PATH: $_julia_depot_path\nJULIA_LOAD_PATH: $_julia_load_path\nJULIA_PROJECT: $_julia_project\nactive_project: $_active_project\n"
end
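A note on the interpolation hack above: `@everywhere` splices caller-side values with `$`, and a string literal re-enters interpolation with `$(...)`, so `"$($x)"` embeds the master's `x` directly in a worker-side string. A minimal sketch (assuming only Distributed; the worker count and value are hypothetical):

```julia
using Distributed
addprocs(2)  # hypothetical local workers, for illustration only

expected = "demo-value"  # lives on the master process
# `$expected` is substituted by @everywhere before the expression ships;
# wrapping it as `$($expected)` places it inside each worker's string.
@everywhere println("worker $(myid()) sees: $($expected)")
```

This pattern removes the need for the `*_interp` temporaries used above.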