Skip to content

Commit 0681607

Browse files
added job tests (#1)
* WIP job tests * added unit tests for Jobs * add more specificity to tests
1 parent 9150352 commit 0681607

File tree

1 file changed

+195
-0
lines changed

1 file changed

+195
-0
lines changed

tests/unit_test.py

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import sys
1717
import filecmp
1818
import os
19+
import re
1920

2021
parent = Path(__file__).resolve().parents[1]
2122
sys.path.append(str(parent) + "/src")
@@ -46,10 +47,20 @@
4647
RayClusterStatus,
4748
CodeFlareClusterStatus,
4849
)
50+
from codeflare_sdk.job.jobs import (
51+
JobDefinition,
52+
Job,
53+
DDPJobDefinition,
54+
DDPJob,
55+
torchx_runner,
56+
)
4957
import openshift
5058
from openshift import OpenShiftPythonException
5159
from openshift.selector import Selector
5260
import ray
61+
from torchx.specs import AppDryRunInfo, AppDef
62+
from torchx.runner import get_runner, Runner
63+
from torchx.schedulers.ray_scheduler import RayJob
5364
import pytest
5465

5566

@@ -1521,6 +1532,7 @@ def test_cluster_status(mocker):
15211532
mocker.patch(
15221533
"codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=fake_ray
15231534
)
1535+
15241536
status, ready = cf.status()
15251537
assert status == CodeFlareClusterStatus.STARTING
15261538
assert ready == False
@@ -1575,3 +1587,186 @@ def test_cmd_line_generation():
15751587
)
15761588
os.remove("unit-test-cluster.yaml")
15771589
os.remove("unit-cmd-cluster.yaml")
1590+
1591+
1592+
def test_jobdefinition_coverage():
    """Exercise the abstract JobDefinition interface for code coverage."""
    definition = JobDefinition()
    test_cluster = Cluster(test_config_creation())
    definition._dry_run(test_cluster)
    definition.submit(test_cluster)
1597+
1598+
1599+
def test_job_coverage():
    """Exercise the abstract Job interface for code coverage."""
    base_job = Job()
    base_job.status()
    base_job.logs()
1603+
1604+
1605+
def test_DDPJobDefinition_creation():
    """
    Build a fully-specified DDPJobDefinition and verify that every
    constructor argument is stored unchanged on the instance.
    Returns the definition so other tests can reuse it as a fixture.
    """
    ddp = DDPJobDefinition(
        script="test.py",
        m=None,
        script_args=["test"],
        name="test",
        cpu=1,
        gpu=0,
        memMB=1024,
        h=None,
        j="2x1",
        env={"test": "test"},
        max_retries=0,
        mounts=[],
        rdzv_port=29500,
        scheduler_args={"requirements": "test"},
    )
    assert ddp.script == "test.py"
    assert ddp.m is None  # PEP 8: compare to None with identity, not ==
    assert ddp.script_args == ["test"]
    assert ddp.name == "test"
    assert ddp.cpu == 1
    assert ddp.gpu == 0
    assert ddp.memMB == 1024
    assert ddp.h is None
    assert ddp.j == "2x1"
    assert ddp.env == {"test": "test"}
    assert ddp.max_retries == 0
    assert ddp.mounts == []
    assert ddp.rdzv_port == 29500
    assert ddp.scheduler_args == {"requirements": "test"}
    return ddp
1637+
1638+
1639+
def test_DDPJobDefinition_dry_run():
    """
    Test that the dry run method returns the correct type: AppDryRunInfo,
    that the attributes of the returned object are of the correct type,
    and that the values from cluster and job definition are correctly passed.
    """
    ddp = test_DDPJobDefinition_creation()
    cluster = Cluster(test_config_creation())
    ddp_job = ddp._dry_run(cluster)
    # Exact-type checks: `type(x) is T` instead of `type(x) == T` (PEP 8),
    # and bare `dict`/`str` instead of `type(dict())`/`type(str())`.
    assert type(ddp_job) is AppDryRunInfo
    assert ddp_job._fmt is not None
    assert type(ddp_job.request) is RayJob
    assert type(ddp_job._app) is AppDef
    assert type(ddp_job._cfg) is dict
    assert type(ddp_job._scheduler) is str

    # Values propagated from the job definition and the cluster config.
    assert ddp_job.request.app_id.startswith("test")
    assert ddp_job.request.working_dir.startswith("/tmp/torchx_workspace")
    assert ddp_job.request.cluster_name == "unit-test-cluster"
    assert ddp_job.request.requirements == "test"

    assert ddp_job._app.roles[0].resource.cpu == 1
    assert ddp_job._app.roles[0].resource.gpu == 0
    assert ddp_job._app.roles[0].resource.memMB == 1024

    assert ddp_job._cfg["cluster_name"] == "unit-test-cluster"
    assert ddp_job._cfg["requirements"] == "test"

    assert ddp_job._scheduler == "ray"
1668+
1669+
1670+
def test_DDPJobDefinition_dry_run_no_resource_args():
    """
    Test that the dry run correctly gets resources from the cluster object
    when the job definition does not specify resources.
    """
    cluster = Cluster(test_config_creation())
    job_def = DDPJobDefinition(
        script="test.py",
        m=None,
        script_args=["test"],
        name="test",
        h=None,
        env={"test": "test"},
        max_retries=0,
        mounts=[],
        rdzv_port=29500,
        scheduler_args={"requirements": "test"},
    )
    dry_run_info = job_def._dry_run(cluster)

    # Resources should be inherited from the cluster configuration.
    resource = dry_run_info._app.roles[0].resource
    assert resource.cpu == cluster.config.max_cpus
    assert resource.gpu == cluster.config.gpu
    assert resource.memMB == cluster.config.max_memory * 1024
    expected_j = f"{cluster.config.max_worker}x{cluster.config.gpu}"
    assert parse_j(dry_run_info._app.roles[0].args[1]) == expected_j
1697+
1698+
1699+
def test_DDPJobDefinition_submit(mocker):
    """
    Tests that the submit method returns the correct type: DDPJob
    And that the attributes of the returned object are of the correct type
    """
    ddp_def = test_DDPJobDefinition_creation()
    cluster = Cluster(test_config_creation())
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.schedule",
        return_value="fake-dashboard-url",
    )  # a fake app_handle
    ddp_job = ddp_def.submit(cluster)
    # Exact-type assertions use identity (`is`) rather than `==` per PEP 8.
    assert type(ddp_job) is DDPJob
    assert type(ddp_job.job_definition) is DDPJobDefinition
    assert type(ddp_job.cluster) is Cluster
    assert type(ddp_job._app_handle) is str
    assert ddp_job._app_handle == "fake-dashboard-url"
1716+
1717+
1718+
def test_DDPJob_creation(mocker):
    """
    Construct a DDPJob directly and verify its state, plus the dry-run
    info that was handed to the (mocked) torchx scheduler.
    Returns the job so the status/logs tests can reuse it as a fixture.
    """
    ddp_def = test_DDPJobDefinition_creation()
    cluster = Cluster(test_config_creation())
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.schedule",
        return_value="fake-dashboard-url",
    )  # a fake app_handle
    ddp_job = DDPJob(ddp_def, cluster)
    # Exact-type assertions use identity (`is`) rather than `==` per PEP 8,
    # and bare `dict`/`str` instead of `type(dict())`/`type(str())`.
    assert type(ddp_job) is DDPJob
    assert type(ddp_job.job_definition) is DDPJobDefinition
    assert type(ddp_job.cluster) is Cluster
    assert type(ddp_job._app_handle) is str
    assert ddp_job._app_handle == "fake-dashboard-url"
    # Inspect the dry-run info actually passed to torchx_runner.schedule.
    _, args, kwargs = torchx_runner.schedule.mock_calls[0]
    assert type(args[0]) is AppDryRunInfo
    job_info = args[0]
    assert type(job_info.request) is RayJob
    assert type(job_info._app) is AppDef
    assert type(job_info._cfg) is dict
    assert type(job_info._scheduler) is str
    return ddp_job
1739+
1740+
1741+
def test_DDPJob_status(mocker):
    """DDPJob.status() should delegate to torchx_runner.status with the app handle."""
    job = test_DDPJob_creation(mocker)
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.status", return_value="fake-status"
    )
    result = job.status()
    assert result == "fake-status"
    _, call_args, call_kwargs = torchx_runner.status.mock_calls[0]
    assert call_args[0] == "fake-dashboard-url"
1749+
1750+
1751+
def test_DDPJob_logs(mocker):
    """DDPJob.logs() should delegate to torchx_runner.log_lines with the app handle."""
    job = test_DDPJob_creation(mocker)
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.log_lines", return_value="fake-logs"
    )
    result = job.logs()
    assert result == "fake-logs"
    _, call_args, call_kwargs = torchx_runner.log_lines.mock_calls[0]
    assert call_args[0] == "fake-dashboard-url"
1759+
1760+
1761+
def parse_j(cmd):
    """
    Extract the node/process spec from a torchx command line.

    Searches *cmd* for the "--nnodes <N> --nproc_per_node <M>" flag pair
    and returns it formatted as "<N>x<M>" (the same syntax as the
    DDPJobDefinition ``j`` argument), or None when the pair is absent.
    """
    # Capture the two values directly instead of re-splitting the match.
    match = re.search(r"--nnodes\s+(\d+)\s+--nproc_per_node\s+(\d+)", cmd)
    if match is None:
        return None
    nnodes, nproc_per_node = match.groups()
    return f"{nnodes}x{nproc_per_node}"

0 commit comments

Comments (0)