From bfcbbcaf5ab65e84a77d24419d59927e1a7a6f8b Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Tue, 4 Apr 2023 14:48:51 -0400 Subject: [PATCH 1/4] Fixed existing demo outputs --- demo-notebooks/batch-job/batch_mnist.ipynb | 4986 +++++++++++++++++++- 1 file changed, 4940 insertions(+), 46 deletions(-) diff --git a/demo-notebooks/batch-job/batch_mnist.ipynb b/demo-notebooks/batch-job/batch_mnist.ipynb index 4d434640d..e6c58027e 100644 --- a/demo-notebooks/batch-job/batch_mnist.ipynb +++ b/demo-notebooks/batch-job/batch_mnist.ipynb @@ -14,15 +14,15 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "614daa0c", "metadata": {}, "outputs": [], "source": [ "# Create authentication object for oc user permissions\n", "auth = TokenAuthentication(\n", - " token = \"XXXX\",\n", - " server = \"XXXX\",\n", + " token = \"XXXXX\",\n", + " server = \"XXXXX\",\n", " skip_tls=True\n", ")\n", "auth.login()" @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "0f4bc870-091f-4e11-9642-cba145710159", "metadata": {}, "outputs": [ @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", "metadata": {}, "outputs": [], @@ -85,20 +85,50 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "id": "3c1b4311-2e61-44c9-8225-87c2db11363d", "metadata": {}, "outputs": [ { - "ename": "AttributeError", - "evalue": "'Cluster' object has no attribute 'is_ready'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcluster\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mis_ready\u001b[49m()\n", - "\u001b[0;31mAttributeError\u001b[0m: 'Cluster' object has no attribute 'is_ready'" - ] + "data": { + "text/html": [ + "
╭─────────────────────────╮\n",
+       "│     🚀 Cluster Queue    │\n",
+       "│        Status 🚀        │\n",
+       "│ +-----------+---------+ │\n",
+       "│ | Name      | Status  | │\n",
+       "│ +===========+=========+ │\n",
+       "│ | mnisttest | pending | │\n",
+       "│ |           |         | │\n",
+       "│ +-----------+---------+ │\n",
+       "╰─────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────╮\n", + "│ \u001b[3m \u001b[0m\u001b[1;3m 🚀 Cluster Queue\u001b[0m\u001b[3m \u001b[0m │\n", + "│ \u001b[3m \u001b[0m\u001b[1;3mStatus 🚀\u001b[0m\u001b[3m \u001b[0m │\n", + "│ +-----------+---------+ │\n", + "│ |\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0m|\u001b[1m \u001b[0m\u001b[1mStatus \u001b[0m\u001b[1m \u001b[0m| │\n", + "│ +===========+=========+ │\n", + "│ |\u001b[36m \u001b[0m\u001b[36mmnisttest\u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35mpending\u001b[0m\u001b[35m \u001b[0m| │\n", + "│ |\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m| │\n", + "│ +-----------+---------+ │\n", + "╰─────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(, False)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -107,20 +137,73 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "a99d5aff", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for requested resources to be set up...\n", + "Requested cluster up and running!\n" + ] + } + ], "source": [ "cluster.wait_ready()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "df71c1ed", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
                   🚀 CodeFlare Cluster Status 🚀                   \n",
+       "                                                                    \n",
+       " ╭────────────────────────────────────────────────────────────────╮ \n",
+       " │   Name                                                         │ \n",
+       " │   mnisttest                                        Active ✅   │ \n",
+       " │                                                                │ \n",
+       " │   URI: ray://mnisttest-head-svc.default.svc:10001              │ \n",
+       " │                                                                │ \n",
+       " │   Dashboard🔗                                                  │ \n",
+       " │                                                                │ \n",
+       " ╰────────────────────────────────────────────────────────────────╯ \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m \u001b[0m\u001b[1;3m 🚀 CodeFlare Cluster Status 🚀\u001b[0m\u001b[3m \u001b[0m\n", + "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + " ╭────────────────────────────────────────────────────────────────╮ \n", + " │ \u001b[1;37;42mName\u001b[0m │ \n", + " │ \u001b[1;4mmnisttest\u001b[0m Active ✅ │ \n", + " │ │ \n", + " │ \u001b[1mURI:\u001b[0m ray://mnisttest-head-svc.default.svc:10001 │ \n", + " │ │ \n", + " │ \u001b]8;id=790579;ray-dashboard-mnisttest-default.apps.meyceoz-032023.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\ │ \n", + " │ │ \n", + " ╰────────────────────────────────────────────────────────────────╯ \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(, True)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cluster.status()" ] @@ -136,14 +219,14 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
                   🚀 CodeFlare Cluster Status 🚀                   \n",
+       "
                   🚀 CodeFlare Cluster Details 🚀                  \n",
        "                                                                    \n",
        " ╭────────────────────────────────────────────────────────────────╮ \n",
        " │   Name                                                         │ \n",
@@ -151,13 +234,20 @@
        " │                                                                │ \n",
        " │   URI: ray://mnisttest-head-svc.default.svc:10001              │ \n",
        " │                                                                │ \n",
-       " │   Dashboard🔗                                                  │ \n",
+       " │   Dashboard🔗                                                  │ \n",
        " │                                                                │ \n",
+       " │                      Cluster Resources                         │ \n",
+       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮     │ \n",
+       " │   │  Min  Max  │  │  Memory      CPU         GPU         │     │ \n",
+       " │   │            │  │                                      │     │ \n",
+       " │   │  2    2    │  │  8~8         2           0           │     │ \n",
+       " │   │            │  │                                      │     │ \n",
+       " │   ╰────────────╯  ╰──────────────────────────────────────╯     │ \n",
        " ╰────────────────────────────────────────────────────────────────╯ \n",
        "
\n" ], "text/plain": [ - "\u001b[3m \u001b[0m\u001b[1;3m 🚀 CodeFlare Cluster Status 🚀\u001b[0m\u001b[3m \u001b[0m\n", + "\u001b[3m \u001b[0m\u001b[1;3m 🚀 CodeFlare Cluster Details 🚀\u001b[0m\u001b[3m \u001b[0m\n", "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", " ╭────────────────────────────────────────────────────────────────╮ \n", " │ \u001b[1;37;42mName\u001b[0m │ \n", @@ -165,8 +255,15 @@ " │ │ \n", " │ \u001b[1mURI:\u001b[0m ray://mnisttest-head-svc.default.svc:10001 │ \n", " │ │ \n", - " │ \u001b]8;id=464037;ray-dashboard-mnisttest-default.apps.kpostoffice.dev.datahub.redhat.com\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\ │ \n", + " │ \u001b]8;id=84168;http://ray-dashboard-mnisttest-default.apps.meyceoz-032023.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\ │ \n", " │ │ \n", + " │ \u001b[3m Cluster Resources \u001b[0m │ \n", + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n", + " │ │ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m │ │ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m │ │ \n", + " │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", + " │ │ \u001b[36m \u001b[0m\u001b[36m2 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m8~8 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m0 \u001b[0m\u001b[35m \u001b[0m │ │ \n", + " │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n", " ╰────────────────────────────────────────────────────────────────╯ \n" ] }, @@ -176,10 +273,10 @@ { "data": { "text/plain": [ - "(, True)" + "RayCluster(name='mnisttest', status=, min_workers=2, max_workers=2, worker_mem_min=8, worker_mem_max=8, worker_cpu=2, worker_gpu=0, namespace='default', dashboard='http://ray-dashboard-mnisttest-default.apps.meyceoz-032023.psap.aws.rhperfscale.org')" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -198,7 +295,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "id": "3cc6183a-8f6e-4347-af91-d088ed422544", "metadata": {}, "outputs": [], @@ -216,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "id": "ced6ccd6-a17e-413a-a0e4-65004fc35463", "metadata": { "scrolled": true @@ -236,7 +333,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 20, "id": "e5c0b0da-c22e-4142-b096-407ac8aebe5e", "metadata": {}, "outputs": [ @@ -245,7 +342,7 @@ "text/plain": [ "AppStatus:\n", " msg: !!python/object/apply:ray.dashboard.modules.job.common.JobStatus\n", - " - FAILED\n", + " - RUNNING\n", " num_restarts: -1\n", " roles:\n", " - replicas:\n", @@ -253,15 +350,15 @@ " id: 0\n", " role: ray\n", " state: !!python/object/apply:torchx.specs.api.AppState\n", - " - 5\n", + " - 3\n", " structured_error_msg: \n", " role: ray\n", - " state: FAILED (5)\n", + " state: RUNNING (3)\n", " structured_error_msg: \n", " ui_url: null" ] }, - "execution_count": 7, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -272,19 +369,4816 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 25, "id": "264c1809-de72-4acf-b0f6-e67d345640f6", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "'[RayActor(name=\\'mnist\\', command=[\\'bash\\', \\'-c\\', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id \\'mnist-nm426236fqknz\\' --nnodes 2 --nproc_per_node 1 --node_rank \\'0\\' --tee 3 --role \\'\\' mnist.py\"], env={\\'LOGLEVEL\\': \\'DEBUG\\', \\'TORCH_DISTRIBUTED_DEBUG\\': \\'DETAIL\\', \\'TORCHX_JOB_ID\\': \\'ray://torchx/mnist-nm426236fqknz\\'}, num_cpus=1, num_gpus=0, min_replicas=2), RayActor(name=\\'mnist\\', command=[\\'bash\\', \\'-c\\', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id \\'mnist-nm426236fqknz\\' --nnodes 2 --nproc_per_node 1 --node_rank \\'1\\' --tee 3 --role \\'\\' mnist.py\"], env={\\'LOGLEVEL\\': \\'DEBUG\\', \\'TORCH_DISTRIBUTED_DEBUG\\': \\'DETAIL\\', \\'TORCHX_JOB_ID\\': \\'ray://torchx/mnist-nm426236fqknz\\'}, num_cpus=1, num_gpus=0, min_replicas=2)]\\n2023-03-03 13:08:48,406\\tINFO worker.py:1230 -- Using address 10.129.2.222:6379 set in the environment variable RAY_ADDRESS\\n2023-03-03 13:08:48,406\\tINFO worker.py:1342 -- Connecting to existing Ray cluster at address: 10.129.2.222:6379...\\n2023-03-03 13:08:48,413\\tINFO worker.py:1519 -- Connected to Ray cluster. View the dashboard at \\x1b[1m\\x1b[32mhttp://10.129.2.222:8265 \\x1b[39m\\x1b[22m\\nWaiting for minimum placement group to start.\\nSuccessfully created placement groups\\nSuccessfully placed command actors\\nEntering main loop, start executing the script on worker nodes\\nrunning ray.wait on [ObjectRef(4482c0d3e15a41a806094fedcf394a8e91a4a2b10500000001000000), ObjectRef(9f79440f8f098da04ea4d79eb7ab2872e23b3c2b0500000001000000)]\\nrunning ray.wait on [ObjectRef(4482c0d3e15a41a806094fedcf394a8e91a4a2b10500000001000000), ObjectRef(d980cb79d3ebbeee4ea4d79eb7ab2872e23b3c2b0500000001000000)]\\nTraceback (most recent call last):\\n File \"ray_driver.py\", line 312, in \\n main()\\n File \"ray_driver.py\", line 308, in main\\n driver.run()\\n File \"ray_driver.py\", line 293, in run\\n terminal = self._step()\\n File \"ray_driver.py\", line 245, in _step\\n result = ray.get(object_ref)\\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/client_mode_hook.py\", line 105, in wrapper\\n return func(*args, **kwargs)\\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/worker.py\", line 2289, in get\\n raise value.as_instanceof_cause()\\nray.exceptions.RayTaskError(RuntimeError): \\x1b[36mray::CommandActor.exec_module()\\x1b[39m (pid=3315, ip=10.129.2.222, repr=)\\n File \"ray_driver.py\", line 76, in exec_module\\n raise RuntimeError(\\nRuntimeError: Either MASTER_ADDR or MASTER_PORT are not set. This is most likely bug in torchxOpen issue at https://github.com/pytorch/torchx\\n'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "[RayActor(name='mnist', command=['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-zvm96dmvgkq5hc' --nnodes 2 --nproc_per_node 1 --node_rank '0' --tee 3 --role '' mnist.py\"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL', 'TORCHX_JOB_ID': 'ray://torchx/mnist-zvm96dmvgkq5hc'}, num_cpus=2, num_gpus=0, min_replicas=2), RayActor(name='mnist', command=['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-zvm96dmvgkq5hc' --nnodes 2 --nproc_per_node 1 --node_rank '1' --tee 3 --role '' mnist.py\"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL', 'TORCHX_JOB_ID': 'ray://torchx/mnist-zvm96dmvgkq5hc'}, num_cpus=2, num_gpus=0, min_replicas=2)]\n", + "2023-04-03 14:55:18,399\tINFO worker.py:1230 -- Using address 10.129.0.91:6379 set in the environment variable RAY_ADDRESS\n", + "2023-04-03 14:55:18,399\tINFO worker.py:1342 -- Connecting to existing Ray cluster at address: 10.129.0.91:6379...\n", + "2023-04-03 14:55:18,404\tINFO worker.py:1519 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttp://10.129.0.91:8265 \u001b[39m\u001b[22m\n", + "Waiting for minimum placement group to start.\n", + "Successfully created placement groups\n", + "rdzv_endpoint set to 10.129.0.93 for actor d45df6656fb9ae4d54e9266f02000000\n", + "rdzv_endpoint set to 10.129.0.93 for actor 1e1ce76b4ad62b80f89134cb02000000\n", + "Successfully placed command actors\n", + "Entering main loop, start executing the script on worker nodes\n", + "running ray.wait on [ObjectRef(e082c90ab8422b00d45df6656fb9ae4d54e9266f0200000001000000), ObjectRef(ce868e48e2fa9a941e1ce76b4ad62b80f89134cb0200000001000000)]\n", + "running ray.wait on [ObjectRef(ce868e48e2fa9a941e1ce76b4ad62b80f89134cb0200000001000000), ObjectRef(f81ec6ff838b16dbd45df6656fb9ae4d54e9266f0200000001000000)]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m entrypoint : mnist.py\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m min_nodes : 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m max_nodes : 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m nproc_per_node : 1\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m run_id : mnist-zvm96dmvgkq5hc\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m rdzv_backend : static\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m rdzv_endpoint : 10.129.0.93:49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m rdzv_configs : {'rank': 0, 'timeout': 900}\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m max_restarts : 0\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m monitor_interval : 5\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m log_dir : None\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m metrics_cfg : {}\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m \n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_pj6rj6_8/mnist-zvm96dmvgkq5hc_c86x83a_\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] starting workers for entrypoint: python\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous'ing worker group\n", + "running ray.wait on [ObjectRef(f81ec6ff838b16dbd45df6656fb9ae4d54e9266f0200000001000000), ObjectRef(32b0eec39cfa87ac1e1ce76b4ad62b80f89134cb0200000001000000)]\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs:\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m entrypoint : mnist.py\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m min_nodes : 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m max_nodes : 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m nproc_per_node : 1\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m run_id : mnist-zvm96dmvgkq5hc\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m rdzv_backend : static\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m rdzv_endpoint : 10.129.0.93:49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m rdzv_configs : {'rank': 1, 'timeout': 900}\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m max_restarts : 0\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m monitor_interval : 5\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m log_dir : None\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m metrics_cfg : {}\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m \n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_l3wjw627/mnist-zvm96dmvgkq5hc_vpc42a2t\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] starting workers for entrypoint: python\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous'ing worker group\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous complete for workers. Result:\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m restart_count=0\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m master_addr=10.129.0.93\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m master_port=49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m group_rank=1\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m group_world_size=2\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m local_ranks=[0]\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m role_ranks=[1]\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m global_ranks=[1]\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m role_world_sizes=[2]\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m global_world_sizes=[2]\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m \n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Starting worker group\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_l3wjw627/mnist-zvm96dmvgkq5hc_vpc42a2t/attempt_0/0/error.json\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous complete for workers. Result:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m restart_count=0\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m master_addr=10.129.0.93\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m master_port=49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m group_rank=0\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m group_world_size=2\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m local_ranks=[0]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m role_ranks=[0]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m global_ranks=[0]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m role_world_sizes=[2]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m global_world_sizes=[2]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m \n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Starting worker group\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_pj6rj6_8/mnist-zvm96dmvgkq5hc_c86x83a_/attempt_0/0/error.json\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:prior to running the trainer\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:MASTER_ADDR: is 10.129.0.93\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:MASTER_PORT: is 49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:GROUP: 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:LOCAL: 1\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading MNIST dataset...\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Validation sanity check: 0it [00:00, ?it/s]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Validation sanity check: 0%| | 0/2 [00:00 Date: Tue, 4 Apr 2023 17:45:31 -0400 Subject: [PATCH 2/4] Added torchx-mcad example --- .../batch-job/batch_mnist_mcad.ipynb | 3387 +++++++++++++++++ ...atch_mnist.ipynb => batch_mnist_ray.ipynb} | 18 +- 2 files changed, 3396 insertions(+), 9 deletions(-) create mode 100644 demo-notebooks/batch-job/batch_mnist_mcad.ipynb rename demo-notebooks/batch-job/{batch_mnist.ipynb => batch_mnist_ray.ipynb} (99%) diff --git a/demo-notebooks/batch-job/batch_mnist_mcad.ipynb b/demo-notebooks/batch-job/batch_mnist_mcad.ipynb new file mode 100644 index 000000000..bfd25d3ab --- /dev/null +++ b/demo-notebooks/batch-job/batch_mnist_mcad.ipynb @@ -0,0 +1,3387 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", + "metadata": {}, + "outputs": [], + "source": [ + "# Import pieces from codeflare-sdk\n", + "from codeflare_sdk.cluster.auth import TokenAuthentication" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9de86658", + "metadata": {}, + "source": [ + "First, we begin by authenticating using the SDK." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "614daa0c", + "metadata": {}, + "outputs": [], + "source": [ + "# Create authentication object for oc user permissions\n", + "auth = TokenAuthentication(\n", + " token = \"XXXXX\",\n", + " server = \"XXXXX\",\n", + " skip_tls=True\n", + ")\n", + "auth.login()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "87d2c9b3", + "metadata": {}, + "source": [ + "Now that we are logged in, we can directly submit our batch job (model training on two workers with four gpus each) to MCAD via torchx." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3cc6183a-8f6e-4347-af91-d088ed422544", + "metadata": {}, + "outputs": [], + "source": [ + "from codeflare_sdk.job.jobs import DDPJobDefinition" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ced6ccd6-a17e-413a-a0e4-65004fc35463", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [] + } + ], + "source": [ + "job = DDPJobDefinition(name=\"mnistjob\", script=\"mnist.py\", scheduler_args={\"namespace\": \"default\"}, j=\"1x1\", gpu=0, cpu=1, memMB=8000, image=\"quay.io/michaelclifford/mnist-test:latest\").submit()" + ] + }, + { + "cell_type": "markdown", + "id": "ff065051", + "metadata": {}, + "source": [ + "Now we can go ahead and look at the status and logs of our batch job." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e5c0b0da-c22e-4142-b096-407ac8aebe5e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/meyceoz/Documents/sdktest/lib64/python3.8/site-packages/torchx/schedulers/kubernetes_mcad_scheduler.py:1047: UserWarning: Warning - MCAD does not report individual replica statuses, but overall task status. Replica id may not match status\n", + " warnings.warn(msg)\n" + ] + }, + { + "data": { + "text/plain": [ + "AppStatus:\n", + " msg: \n", + " num_restarts: -1\n", + " roles:\n", + " - replicas:\n", + " - hostname: ''\n", + " id: 0\n", + " role: mnist\n", + " state: !!python/object/apply:torchx.specs.api.AppState\n", + " - 3\n", + " structured_error_msg: \n", + " role: mnist\n", + " state: RUNNING (3)\n", + " structured_error_msg: \n", + " ui_url: null" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "job.status()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "264c1809-de72-4acf-b0f6-e67d345640f6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-04-04T21:07:18.216609937Z INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs:\n", + "2023-04-04T21:07:18.216609937Z entrypoint : mnist.py\n", + "2023-04-04T21:07:18.216609937Z min_nodes : 1\n", + "2023-04-04T21:07:18.216609937Z max_nodes : 1\n", + "2023-04-04T21:07:18.216609937Z nproc_per_node : 1\n", + "2023-04-04T21:07:18.216609937Z run_id : mnistjob-d6hz7lmsvx4scd\n", + "2023-04-04T21:07:18.216609937Z rdzv_backend : static\n", + "2023-04-04T21:07:18.216609937Z rdzv_endpoint : localhost:49782\n", + "2023-04-04T21:07:18.216609937Z rdzv_configs : {'rank': 0, 'timeout': 900}\n", + "2023-04-04T21:07:18.216609937Z max_restarts : 0\n", + "2023-04-04T21:07:18.216609937Z monitor_interval : 5\n", + "2023-04-04T21:07:18.216609937Z log_dir : None\n", + "2023-04-04T21:07:18.216609937Z metrics_cfg : {}\n", + "2023-04-04T21:07:18.216609937Z \n", + "2023-04-04T21:07:18.217434715Z INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_5_zjjbf1/mnistjob-d6hz7lmsvx4scd_a107om0q\n", + "2023-04-04T21:07:18.217476057Z INFO:torch.distributed.elastic.agent.server.api:[] starting workers for entrypoint: python\n", + "2023-04-04T21:07:18.217498330Z INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous'ing worker group\n", + "2023-04-04T21:07:18.218861938Z INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous complete for workers. Result:\n", + "2023-04-04T21:07:18.218861938Z restart_count=0\n", + "2023-04-04T21:07:18.218861938Z master_addr=localhost\n", + "2023-04-04T21:07:18.218861938Z master_port=49782\n", + "2023-04-04T21:07:18.218861938Z group_rank=0\n", + "2023-04-04T21:07:18.218861938Z group_world_size=1\n", + "2023-04-04T21:07:18.218861938Z local_ranks=[0]\n", + "2023-04-04T21:07:18.218861938Z role_ranks=[0]\n", + "2023-04-04T21:07:18.218861938Z global_ranks=[0]\n", + "2023-04-04T21:07:18.218861938Z role_world_sizes=[1]\n", + "2023-04-04T21:07:18.218861938Z global_world_sizes=[1]\n", + "2023-04-04T21:07:18.218861938Z \n", + "2023-04-04T21:07:18.218890288Z INFO:torch.distributed.elastic.agent.server.api:[] Starting worker group\n", + "2023-04-04T21:07:18.219122325Z INFO:torch.distributed.elastic.agent.server.local_elastic_agent:Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.\n", + "2023-04-04T21:07:18.219220551Z INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_5_zjjbf1/mnistjob-d6hz7lmsvx4scd_a107om0q/attempt_0/0/error.json\n", + "2023-04-04T21:07:20.356947480Z [0]:GPU available: False, used: False\n", + "2023-04-04T21:07:20.356947480Z [0]:TPU available: False, using: 0 TPU cores\n", + "2023-04-04T21:07:20.356947480Z [0]:IPU available: False, using: 0 IPUs\n", + "2023-04-04T21:07:20.557348006Z [0]:\n", + "2023-04-04T21:07:20.657500757Z [0]: 0%| | 0/9912422 [00:00 Date: Tue, 4 Apr 2023 17:46:17 -0400 Subject: [PATCH 3/4] Removed personal cluster mention --- demo-notebooks/batch-job/batch_mnist_mcad.ipynb | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/demo-notebooks/batch-job/batch_mnist_mcad.ipynb b/demo-notebooks/batch-job/batch_mnist_mcad.ipynb index bfd25d3ab..b8e03ce8a 100644 --- a/demo-notebooks/batch-job/batch_mnist_mcad.ipynb +++ b/demo-notebooks/batch-job/batch_mnist_mcad.ipynb @@ -3338,21 +3338,10 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "0d41b90e", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Logged \"kube:admin\" out on \"https://api.meyceoz-032023.psap.aws.rhperfscale.org:6443\"\\n'" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "auth.logout()" ] From 2955b0c195e2500bea9d2bc76139ffd6b968d194 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Mon, 10 Apr 2023 12:24:40 -0400 Subject: [PATCH 4/4] Review feedback --- .../batch-job/batch_mnist_mcad.ipynb | 25 ++++++------- .../batch-job/batch_mnist_ray.ipynb | 35 +++++++++++-------- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/demo-notebooks/batch-job/batch_mnist_mcad.ipynb b/demo-notebooks/batch-job/batch_mnist_mcad.ipynb index b8e03ce8a..ada207359 100644 --- a/demo-notebooks/batch-job/batch_mnist_mcad.ipynb +++ b/demo-notebooks/batch-job/batch_mnist_mcad.ipynb @@ -8,7 +8,8 @@ "outputs": [], "source": [ "# Import pieces from codeflare-sdk\n", - "from codeflare_sdk.cluster.auth import TokenAuthentication" + "from codeflare_sdk.cluster.auth import TokenAuthentication\n", + "from codeflare_sdk.job.jobs import DDPJobDefinition" ] }, { @@ -45,16 +46,6 @@ "Now that we are logged in, we can directly submit our batch job (model training on two workers with four gpus each) to MCAD via torchx." ] }, - { - "cell_type": "code", - "execution_count": 5, - "id": "3cc6183a-8f6e-4347-af91-d088ed422544", - "metadata": {}, - "outputs": [], - "source": [ - "from codeflare_sdk.job.jobs import DDPJobDefinition" - ] - }, { "cell_type": "code", "execution_count": 6, @@ -70,7 +61,17 @@ } ], "source": [ - "job = DDPJobDefinition(name=\"mnistjob\", script=\"mnist.py\", scheduler_args={\"namespace\": \"default\"}, j=\"1x1\", gpu=0, cpu=1, memMB=8000, image=\"quay.io/michaelclifford/mnist-test:latest\").submit()" + "job = DDPJobDefinition(\n", + " name=\"mnistjob\",\n", + " script=\"mnist.py\",\n", + " scheduler_args={\"namespace\": \"default\"},\n", + " j=\"1x1\",\n", + " gpu=0,\n", + " cpu=1,\n", + " memMB=8000,\n", + " image=\"quay.io/michaelclifford/mnist-test:latest\"\n", + ")\n", + "job.submit()" ] }, { diff --git a/demo-notebooks/batch-job/batch_mnist_ray.ipynb b/demo-notebooks/batch-job/batch_mnist_ray.ipynb index 1133fec4f..cedf90a2b 100644 --- a/demo-notebooks/batch-job/batch_mnist_ray.ipynb +++ b/demo-notebooks/batch-job/batch_mnist_ray.ipynb @@ -9,7 +9,8 @@ "source": [ "# Import pieces from codeflare-sdk\n", "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n", - "from codeflare_sdk.cluster.auth import TokenAuthentication" + "from codeflare_sdk.cluster.auth import TokenAuthentication\n", + "from codeflare_sdk.job.jobs import DDPJobDefinition" ] }, { @@ -51,8 +52,20 @@ } ], "source": [ - "# Create our cluster and submit appwrapper\n", - "cluster = Cluster(ClusterConfiguration(name='mnisttest', min_worker=2, max_worker=2, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))" + "# Create our cluster and submit appwrapper (reduce specs as desired)\n", + "cluster = Cluster(ClusterConfiguration(\n", + " name='mnisttest',\n", + " namespace='default',\n", + " min_worker=2,\n", + " max_worker=2,\n", + " min_cpus=8,\n", + " max_cpus=8,\n", + " min_memory=16,\n", + " max_memory=16,\n", + " gpu=4,\n", + " instascale=True, # Can be set to false if scaling not needed\n", + " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"] # Can be removed if above is false\n", + "))" ] }, { @@ -293,16 +306,6 @@ "Now that our resource cluster is ready, we can directly submit our batch job (model training on two workers with four gpus each) to the cluster via torchx." ] }, - { - "cell_type": "code", - "execution_count": 9, - "id": "3cc6183a-8f6e-4347-af91-d088ed422544", - "metadata": {}, - "outputs": [], - "source": [ - "from codeflare_sdk.job.jobs import DDPJobDefinition" - ] - }, { "cell_type": "code", "execution_count": 10, @@ -320,7 +323,11 @@ } ], "source": [ - "job = DDPJobDefinition(script=\"mnist.py\", scheduler_args={\"requirements\": \"requirements.txt\"}).submit(cluster)" + "job = DDPJobDefinition(\n", + " script=\"mnist.py\",\n", + " scheduler_args={\"requirements\": \"requirements.txt\"}\n", + ")\n", + "job.submit(cluster)" ] }, {