diff --git a/demo-notebooks/batch-job/batch_mnist.ipynb b/demo-notebooks/batch-job/batch_mnist.ipynb deleted file mode 100644 index 4d434640d..000000000 --- a/demo-notebooks/batch-job/batch_mnist.ipynb +++ /dev/null @@ -1,349 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", - "metadata": {}, - "outputs": [], - "source": [ - "# Import pieces from codeflare-sdk\n", - "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n", - "from codeflare_sdk.cluster.auth import TokenAuthentication" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "614daa0c", - "metadata": {}, - "outputs": [], - "source": [ - "# Create authentication object for oc user permissions\n", - "auth = TokenAuthentication(\n", - " token = \"XXXX\",\n", - " server = \"XXXX\",\n", - " skip_tls=True\n", - ")\n", - "auth.login()" - ] - }, - { - "cell_type": "markdown", - "id": "bc27f84c", - "metadata": {}, - "source": [ - "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "0f4bc870-091f-4e11-9642-cba145710159", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Written to: mnisttest.yaml\n" - ] - } - ], - "source": [ - "# Create our cluster and submit appwrapper\n", - "cluster = Cluster(ClusterConfiguration(name='mnisttest', min_worker=2, max_worker=2, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))" - ] - }, - { - "cell_type": "markdown", - "id": "12eef53c", - "metadata": {}, - "source": [ - "Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster AppWrapper yaml onto the MCAD queue, and begin the process of obtaining our resource cluster." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", - "metadata": {}, - "outputs": [], - "source": [ - "# Bring up the cluster\n", - "cluster.up()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "657ebdfb", - "metadata": {}, - "source": [ - "Now, we want to check on the status of our resource cluster, and wait until it is finally ready for use." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "3c1b4311-2e61-44c9-8225-87c2db11363d", - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'Cluster' object has no attribute 'is_ready'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcluster\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mis_ready\u001b[49m()\n", - "\u001b[0;31mAttributeError\u001b[0m: 'Cluster' object has no attribute 'is_ready'" - ] - } - ], - "source": [ - "cluster.status()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a99d5aff", - "metadata": {}, - "outputs": [], - "source": [ - "cluster.wait_ready()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df71c1ed", - "metadata": {}, - "outputs": [], - "source": [ - "cluster.status()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "b3a55fe4", - "metadata": {}, - "source": [ - "Let's quickly verify that the specs of the cluster are as expected." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
                   🚀 CodeFlare Cluster Status 🚀                   \n",
-       "                                                                    \n",
-       " ╭────────────────────────────────────────────────────────────────╮ \n",
-       " │   Name                                                         │ \n",
-       " │   mnisttest                                        Active ✅   │ \n",
-       " │                                                                │ \n",
-       " │   URI: ray://mnisttest-head-svc.default.svc:10001              │ \n",
-       " │                                                                │ \n",
-       " │   Dashboard🔗                                                  │ \n",
-       " │                                                                │ \n",
-       " ╰────────────────────────────────────────────────────────────────╯ \n",
-       "
\n" - ], - "text/plain": [ - "\u001b[3m \u001b[0m\u001b[1;3m ๐Ÿš€ CodeFlare Cluster Status ๐Ÿš€\u001b[0m\u001b[3m \u001b[0m\n", - "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", - " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n", - " โ”‚ \u001b[1;37;42mName\u001b[0m โ”‚ \n", - " โ”‚ \u001b[1;4mmnisttest\u001b[0m Active โœ… โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b[1mURI:\u001b[0m ray://mnisttest-head-svc.default.svc:10001 โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b]8;id=464037;ray-dashboard-mnisttest-default.apps.kpostoffice.dev.datahub.redhat.com\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "(, True)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cluster.details()" - ] - }, - { - "cell_type": "markdown", - "id": "87d2c9b3", - "metadata": {}, - "source": [ - "Now that our resource cluster is ready, we can directly submit our batch job (model training on two workers with four gpus each) to the cluster via torchx." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "3cc6183a-8f6e-4347-af91-d088ed422544", - "metadata": {}, - "outputs": [], - "source": [ - "from codeflare_sdk.job.jobs import DDPJobDefinition" - ] - }, - { - "cell_type": "markdown", - "id": "ff065051", - "metadata": {}, - "source": [ - "Now we can go ahead and look at the status and logs of our batch job." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "ced6ccd6-a17e-413a-a0e4-65004fc35463", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The Ray scheduler does not support port mapping.\n" - ] - } - ], - "source": [ - "job = DDPJobDefinition(script=\"mnist.py\", scheduler_args={\"requirements\": \"requirements.txt\"}).submit(cluster)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "e5c0b0da-c22e-4142-b096-407ac8aebe5e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AppStatus:\n", - " msg: !!python/object/apply:ray.dashboard.modules.job.common.JobStatus\n", - " - FAILED\n", - " num_restarts: -1\n", - " roles:\n", - " - replicas:\n", - " - hostname: \n", - " id: 0\n", - " role: ray\n", - " state: !!python/object/apply:torchx.specs.api.AppState\n", - " - 5\n", - " structured_error_msg: \n", - " role: ray\n", - " state: FAILED (5)\n", - " structured_error_msg: \n", - " ui_url: null" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "job.status()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "264c1809-de72-4acf-b0f6-e67d345640f6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'[RayActor(name=\\'mnist\\', command=[\\'bash\\', \\'-c\\', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id \\'mnist-nm426236fqknz\\' --nnodes 2 --nproc_per_node 1 --node_rank \\'0\\' --tee 3 --role \\'\\' mnist.py\"], env={\\'LOGLEVEL\\': \\'DEBUG\\', \\'TORCH_DISTRIBUTED_DEBUG\\': \\'DETAIL\\', \\'TORCHX_JOB_ID\\': \\'ray://torchx/mnist-nm426236fqknz\\'}, num_cpus=1, num_gpus=0, min_replicas=2), RayActor(name=\\'mnist\\', command=[\\'bash\\', \\'-c\\', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id \\'mnist-nm426236fqknz\\' --nnodes 2 
--nproc_per_node 1 --node_rank \\'1\\' --tee 3 --role \\'\\' mnist.py\"], env={\\'LOGLEVEL\\': \\'DEBUG\\', \\'TORCH_DISTRIBUTED_DEBUG\\': \\'DETAIL\\', \\'TORCHX_JOB_ID\\': \\'ray://torchx/mnist-nm426236fqknz\\'}, num_cpus=1, num_gpus=0, min_replicas=2)]\\n2023-03-03 13:08:48,406\\tINFO worker.py:1230 -- Using address 10.129.2.222:6379 set in the environment variable RAY_ADDRESS\\n2023-03-03 13:08:48,406\\tINFO worker.py:1342 -- Connecting to existing Ray cluster at address: 10.129.2.222:6379...\\n2023-03-03 13:08:48,413\\tINFO worker.py:1519 -- Connected to Ray cluster. View the dashboard at \\x1b[1m\\x1b[32mhttp://10.129.2.222:8265 \\x1b[39m\\x1b[22m\\nWaiting for minimum placement group to start.\\nSuccessfully created placement groups\\nSuccessfully placed command actors\\nEntering main loop, start executing the script on worker nodes\\nrunning ray.wait on [ObjectRef(4482c0d3e15a41a806094fedcf394a8e91a4a2b10500000001000000), ObjectRef(9f79440f8f098da04ea4d79eb7ab2872e23b3c2b0500000001000000)]\\nrunning ray.wait on [ObjectRef(4482c0d3e15a41a806094fedcf394a8e91a4a2b10500000001000000), ObjectRef(d980cb79d3ebbeee4ea4d79eb7ab2872e23b3c2b0500000001000000)]\\nTraceback (most recent call last):\\n File \"ray_driver.py\", line 312, in \\n main()\\n File \"ray_driver.py\", line 308, in main\\n driver.run()\\n File \"ray_driver.py\", line 293, in run\\n terminal = self._step()\\n File \"ray_driver.py\", line 245, in _step\\n result = ray.get(object_ref)\\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/client_mode_hook.py\", line 105, in wrapper\\n return func(*args, **kwargs)\\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/worker.py\", line 2289, in get\\n raise value.as_instanceof_cause()\\nray.exceptions.RayTaskError(RuntimeError): \\x1b[36mray::CommandActor.exec_module()\\x1b[39m (pid=3315, ip=10.129.2.222, repr=)\\n File \"ray_driver.py\", line 76, in exec_module\\n raise RuntimeError(\\nRuntimeError: Either MASTER_ADDR or 
MASTER_PORT are not set. This is most likely bug in torchxOpen issue at https://github.com/pytorch/torchx\\n'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(job.logs())" - ] - }, - { - "cell_type": "markdown", - "id": "5af8cd32", - "metadata": {}, - "source": [ - "Finally, we bring our resource cluster down and release/terminate the associated resources, bringing everything back to the way it was before our cluster was brought up." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57", - "metadata": {}, - "outputs": [], - "source": [ - "cluster.down()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d41b90e", - "metadata": {}, - "outputs": [], - "source": [ - "auth.logout()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - }, - "vscode": { - "interpreter": { - "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/demo-notebooks/batch-job/batch_mnist_mcad.ipynb b/demo-notebooks/batch-job/batch_mnist_mcad.ipynb new file mode 100644 index 000000000..ada207359 --- /dev/null +++ b/demo-notebooks/batch-job/batch_mnist_mcad.ipynb @@ -0,0 +1,3377 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", + "metadata": {}, + "outputs": [], + "source": [ + "# Import pieces from codeflare-sdk\n", + "from codeflare_sdk.cluster.auth import TokenAuthentication\n", + "from codeflare_sdk.job.jobs import DDPJobDefinition" + ] + }, + { + "attachments": 
{}, + "cell_type": "markdown", + "id": "9de86658", + "metadata": {}, + "source": [ + "First, we begin by authenticating using the SDK." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "614daa0c", + "metadata": {}, + "outputs": [], + "source": [ + "# Create authentication object for oc user permissions\n", + "auth = TokenAuthentication(\n", + " token = \"XXXXX\",\n", + " server = \"XXXXX\",\n", + " skip_tls=True\n", + ")\n", + "auth.login()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "87d2c9b3", + "metadata": {}, + "source": [ + "Now that we are logged in, we can directly submit our batch job (model training on a single worker with one CPU and no GPUs) to MCAD via torchx." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ced6ccd6-a17e-413a-a0e4-65004fc35463", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [] + } + ], + "source": [ + "job = DDPJobDefinition(\n", + " name=\"mnistjob\",\n", + " script=\"mnist.py\",\n", + " scheduler_args={\"namespace\": \"default\"},\n", + " j=\"1x1\",\n", + " gpu=0,\n", + " cpu=1,\n", + " memMB=8000,\n", + " image=\"quay.io/michaelclifford/mnist-test:latest\"\n", + ")\n", + "job.submit()" + ] + }, + { + "cell_type": "markdown", + "id": "ff065051", + "metadata": {}, + "source": [ + "Now we can go ahead and look at the status and logs of our batch job." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e5c0b0da-c22e-4142-b096-407ac8aebe5e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/meyceoz/Documents/sdktest/lib64/python3.8/site-packages/torchx/schedulers/kubernetes_mcad_scheduler.py:1047: UserWarning: Warning - MCAD does not report individual replica statuses, but overall task status.
Replica id may not match status\n", + " warnings.warn(msg)\n" + ] + }, + { + "data": { + "text/plain": [ + "AppStatus:\n", + " msg: \n", + " num_restarts: -1\n", + " roles:\n", + " - replicas:\n", + " - hostname: ''\n", + " id: 0\n", + " role: mnist\n", + " state: !!python/object/apply:torchx.specs.api.AppState\n", + " - 3\n", + " structured_error_msg: \n", + " role: mnist\n", + " state: RUNNING (3)\n", + " structured_error_msg: \n", + " ui_url: null" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "job.status()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "264c1809-de72-4acf-b0f6-e67d345640f6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-04-04T21:07:18.216609937Z INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs:\n", + "2023-04-04T21:07:18.216609937Z entrypoint : mnist.py\n", + "2023-04-04T21:07:18.216609937Z min_nodes : 1\n", + "2023-04-04T21:07:18.216609937Z max_nodes : 1\n", + "2023-04-04T21:07:18.216609937Z nproc_per_node : 1\n", + "2023-04-04T21:07:18.216609937Z run_id : mnistjob-d6hz7lmsvx4scd\n", + "2023-04-04T21:07:18.216609937Z rdzv_backend : static\n", + "2023-04-04T21:07:18.216609937Z rdzv_endpoint : localhost:49782\n", + "2023-04-04T21:07:18.216609937Z rdzv_configs : {'rank': 0, 'timeout': 900}\n", + "2023-04-04T21:07:18.216609937Z max_restarts : 0\n", + "2023-04-04T21:07:18.216609937Z monitor_interval : 5\n", + "2023-04-04T21:07:18.216609937Z log_dir : None\n", + "2023-04-04T21:07:18.216609937Z metrics_cfg : {}\n", + "2023-04-04T21:07:18.216609937Z \n", + "2023-04-04T21:07:18.217434715Z INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_5_zjjbf1/mnistjob-d6hz7lmsvx4scd_a107om0q\n", + "2023-04-04T21:07:18.217476057Z INFO:torch.distributed.elastic.agent.server.api:[] starting workers for entrypoint: python\n", + 
"2023-04-04T21:07:18.217498330Z INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous'ing worker group\n", + "2023-04-04T21:07:18.218861938Z INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous complete for workers. Result:\n", + "2023-04-04T21:07:18.218861938Z restart_count=0\n", + "2023-04-04T21:07:18.218861938Z master_addr=localhost\n", + "2023-04-04T21:07:18.218861938Z master_port=49782\n", + "2023-04-04T21:07:18.218861938Z group_rank=0\n", + "2023-04-04T21:07:18.218861938Z group_world_size=1\n", + "2023-04-04T21:07:18.218861938Z local_ranks=[0]\n", + "2023-04-04T21:07:18.218861938Z role_ranks=[0]\n", + "2023-04-04T21:07:18.218861938Z global_ranks=[0]\n", + "2023-04-04T21:07:18.218861938Z role_world_sizes=[1]\n", + "2023-04-04T21:07:18.218861938Z global_world_sizes=[1]\n", + "2023-04-04T21:07:18.218861938Z \n", + "2023-04-04T21:07:18.218890288Z INFO:torch.distributed.elastic.agent.server.api:[] Starting worker group\n", + "2023-04-04T21:07:18.219122325Z INFO:torch.distributed.elastic.agent.server.local_elastic_agent:Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. 
Do not start FileTimerServer.\n", + "2023-04-04T21:07:18.219220551Z INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_5_zjjbf1/mnistjob-d6hz7lmsvx4scd_a107om0q/attempt_0/0/error.json\n", + "2023-04-04T21:07:20.356947480Z [0]:GPU available: False, used: False\n", + "2023-04-04T21:07:20.356947480Z [0]:TPU available: False, using: 0 TPU cores\n", + "2023-04-04T21:07:20.356947480Z [0]:IPU available: False, using: 0 IPUs\n", + "2023-04-04T21:07:20.557348006Z [0]:\n", + "2023-04-04T21:07:20.657500757Z [0]: 0%| | 0/9912422 [00:00โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n", + "โ”‚ ๐Ÿš€ Cluster Queue โ”‚\n", + "โ”‚ Status ๐Ÿš€ โ”‚\n", + "โ”‚ +-----------+---------+ โ”‚\n", + "โ”‚ | Name | Status | โ”‚\n", + "โ”‚ +===========+=========+ โ”‚\n", + "โ”‚ | mnisttest | pending | โ”‚\n", + "โ”‚ | | | โ”‚\n", + "โ”‚ +-----------+---------+ โ”‚\n", + "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n", + "\n" + ], + "text/plain": [ + "โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n", + "โ”‚ \u001b[3m \u001b[0m\u001b[1;3m ๐Ÿš€ Cluster Queue\u001b[0m\u001b[3m \u001b[0m โ”‚\n", + "โ”‚ \u001b[3m \u001b[0m\u001b[1;3mStatus ๐Ÿš€\u001b[0m\u001b[3m \u001b[0m โ”‚\n", + "โ”‚ +-----------+---------+ โ”‚\n", + "โ”‚ |\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0m|\u001b[1m \u001b[0m\u001b[1mStatus \u001b[0m\u001b[1m \u001b[0m| โ”‚\n", + "โ”‚ +===========+=========+ โ”‚\n", + "โ”‚ |\u001b[36m \u001b[0m\u001b[36mmnisttest\u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35mpending\u001b[0m\u001b[35m \u001b[0m| โ”‚\n", + "โ”‚ |\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m| โ”‚\n", + "โ”‚ +-----------+---------+ โ”‚\n", + "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(, False)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster.status()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a99d5aff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for requested resources to be set up...\n", + "Requested cluster up and running!\n" + ] + } + ], + "source": [ + "cluster.wait_ready()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "df71c1ed", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
                   🚀 CodeFlare Cluster Status 🚀                   \n",
+       "                                                                    \n",
+       " ╭────────────────────────────────────────────────────────────────╮ \n",
+       " │   Name                                                         │ \n",
+       " │   mnisttest                                        Active ✅   │ \n",
+       " │                                                                │ \n",
+       " │   URI: ray://mnisttest-head-svc.default.svc:10001              │ \n",
+       " │                                                                │ \n",
+       " │   Dashboard🔗                                                  │ \n",
+       " │                                                                │ \n",
+       " ╰────────────────────────────────────────────────────────────────╯ \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m \u001b[0m\u001b[1;3m ๐Ÿš€ CodeFlare Cluster Status ๐Ÿš€\u001b[0m\u001b[3m \u001b[0m\n", + "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n", + " โ”‚ \u001b[1;37;42mName\u001b[0m โ”‚ \n", + " โ”‚ \u001b[1;4mmnisttest\u001b[0m Active โœ… โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b[1mURI:\u001b[0m ray://mnisttest-head-svc.default.svc:10001 โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b]8;id=790579;ray-dashboard-mnisttest-default.apps.meyceoz-032023.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(, True)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster.status()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b3a55fe4", + "metadata": {}, + "source": [ + "Let's quickly verify that the specs of the cluster are as expected." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
                   🚀 CodeFlare Cluster Details 🚀                  \n",
+       "                                                                    \n",
+       " ╭────────────────────────────────────────────────────────────────╮ \n",
+       " │   Name                                                         │ \n",
+       " │   mnisttest                                        Active ✅   │ \n",
+       " │                                                                │ \n",
+       " │   URI: ray://mnisttest-head-svc.default.svc:10001              │ \n",
+       " │                                                                │ \n",
+       " │   Dashboard🔗                                                  │ \n",
+       " │                                                                │ \n",
+       " │                      Cluster Resources                         │ \n",
+       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮     │ \n",
+       " │   │  Min  Max  │  │  Memory      CPU         GPU         │     │ \n",
+       " │   │            │  │                                      │     │ \n",
+       " │   │  2    2    │  │  8~8         2           0           │     │ \n",
+       " │   │            │  │                                      │     │ \n",
+       " │   ╰────────────╯  ╰──────────────────────────────────────╯     │ \n",
+       " ╰────────────────────────────────────────────────────────────────╯ \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m \u001b[0m\u001b[1;3m ๐Ÿš€ CodeFlare Cluster Details ๐Ÿš€\u001b[0m\u001b[3m \u001b[0m\n", + "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n", + " โ”‚ \u001b[1;37;42mName\u001b[0m โ”‚ \n", + " โ”‚ \u001b[1;4mmnisttest\u001b[0m Active โœ… โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b[1mURI:\u001b[0m ray://mnisttest-head-svc.default.svc:10001 โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b]8;id=84168;http://ray-dashboard-mnisttest-default.apps.meyceoz-032023.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b[3m Cluster Resources \u001b[0m โ”‚ \n", + " โ”‚ โ•ญโ”€ Workers โ”€โ”€โ•ฎ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ โ”‚ \n", + " โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m2 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m8~8 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m0 \u001b[0m\u001b[35m \u001b[0m 
โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ”‚ \n", + " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "RayCluster(name='mnisttest', status=, min_workers=2, max_workers=2, worker_mem_min=8, worker_mem_max=8, worker_cpu=2, worker_gpu=0, namespace='default', dashboard='http://ray-dashboard-mnisttest-default.apps.meyceoz-032023.psap.aws.rhperfscale.org')" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster.details()" + ] + }, + { + "cell_type": "markdown", + "id": "87d2c9b3", + "metadata": {}, + "source": [ + "Now that our resource cluster is ready, we can directly submit our batch job (model training on two CPU-only workers) to the cluster via torchx."
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ced6ccd6-a17e-413a-a0e4-65004fc35463", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The Ray scheduler does not support port mapping.\n" + ] + } + ], + "source": [ + "job = DDPJobDefinition(\n", + " script=\"mnist.py\",\n", + " scheduler_args={\"requirements\": \"requirements.txt\"}\n", + ")\n", + "job.submit(cluster)" + ] + }, + { + "cell_type": "markdown", + "id": "ff065051", + "metadata": {}, + "source": [ + "Now we can go ahead and look at the status and logs of our batch job." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "e5c0b0da-c22e-4142-b096-407ac8aebe5e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AppStatus:\n", + " msg: !!python/object/apply:ray.dashboard.modules.job.common.JobStatus\n", + " - RUNNING\n", + " num_restarts: -1\n", + " roles:\n", + " - replicas:\n", + " - hostname: \n", + " id: 0\n", + " role: ray\n", + " state: !!python/object/apply:torchx.specs.api.AppState\n", + " - 3\n", + " structured_error_msg: \n", + " role: ray\n", + " state: RUNNING (3)\n", + " structured_error_msg: \n", + " ui_url: null" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "job.status()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "264c1809-de72-4acf-b0f6-e67d345640f6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[RayActor(name='mnist', command=['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-zvm96dmvgkq5hc' --nnodes 2 --nproc_per_node 1 --node_rank '0' --tee 3 --role '' mnist.py\"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL', 'TORCHX_JOB_ID': 'ray://torchx/mnist-zvm96dmvgkq5hc'}, num_cpus=2, num_gpus=0, min_replicas=2), 
RayActor(name='mnist', command=['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-zvm96dmvgkq5hc' --nnodes 2 --nproc_per_node 1 --node_rank '1' --tee 3 --role '' mnist.py\"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL', 'TORCHX_JOB_ID': 'ray://torchx/mnist-zvm96dmvgkq5hc'}, num_cpus=2, num_gpus=0, min_replicas=2)]\n", + "2023-04-03 14:55:18,399\tINFO worker.py:1230 -- Using address 10.129.0.91:6379 set in the environment variable RAY_ADDRESS\n", + "2023-04-03 14:55:18,399\tINFO worker.py:1342 -- Connecting to existing Ray cluster at address: 10.129.0.91:6379...\n", + "2023-04-03 14:55:18,404\tINFO worker.py:1519 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttp://10.129.0.91:8265 \u001b[39m\u001b[22m\n", + "Waiting for minimum placement group to start.\n", + "Successfully created placement groups\n", + "rdzv_endpoint set to 10.129.0.93 for actor d45df6656fb9ae4d54e9266f02000000\n", + "rdzv_endpoint set to 10.129.0.93 for actor 1e1ce76b4ad62b80f89134cb02000000\n", + "Successfully placed command actors\n", + "Entering main loop, start executing the script on worker nodes\n", + "running ray.wait on [ObjectRef(e082c90ab8422b00d45df6656fb9ae4d54e9266f0200000001000000), ObjectRef(ce868e48e2fa9a941e1ce76b4ad62b80f89134cb0200000001000000)]\n", + "running ray.wait on [ObjectRef(ce868e48e2fa9a941e1ce76b4ad62b80f89134cb0200000001000000), ObjectRef(f81ec6ff838b16dbd45df6656fb9ae4d54e9266f0200000001000000)]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m entrypoint : mnist.py\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m min_nodes : 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m max_nodes : 2\n", + 
"\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m nproc_per_node : 1\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m run_id : mnist-zvm96dmvgkq5hc\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m rdzv_backend : static\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m rdzv_endpoint : 10.129.0.93:49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m rdzv_configs : {'rank': 0, 'timeout': 900}\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m max_restarts : 0\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m monitor_interval : 5\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m log_dir : None\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m metrics_cfg : {}\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m \n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_pj6rj6_8/mnist-zvm96dmvgkq5hc_c86x83a_\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] starting workers for entrypoint: python\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous'ing worker group\n", + "running ray.wait on [ObjectRef(f81ec6ff838b16dbd45df6656fb9ae4d54e9266f0200000001000000), ObjectRef(32b0eec39cfa87ac1e1ce76b4ad62b80f89134cb0200000001000000)]\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs:\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m entrypoint : mnist.py\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m min_nodes : 2\n", + "\u001b[2m\u001b[36m(CommandActor 
pid=358)\u001b[0m max_nodes : 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m nproc_per_node : 1\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m run_id : mnist-zvm96dmvgkq5hc\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m rdzv_backend : static\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m rdzv_endpoint : 10.129.0.93:49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m rdzv_configs : {'rank': 1, 'timeout': 900}\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m max_restarts : 0\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m monitor_interval : 5\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m log_dir : None\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m metrics_cfg : {}\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m \n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_l3wjw627/mnist-zvm96dmvgkq5hc_vpc42a2t\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] starting workers for entrypoint: python\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous'ing worker group\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous complete for workers. 
Result:\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m restart_count=0\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m master_addr=10.129.0.93\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m master_port=49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m group_rank=1\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m group_world_size=2\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m local_ranks=[0]\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m role_ranks=[1]\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m global_ranks=[1]\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m role_world_sizes=[2]\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m global_world_sizes=[2]\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m \n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Starting worker group\n", + "\u001b[2m\u001b[36m(CommandActor pid=358)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_l3wjw627/mnist-zvm96dmvgkq5hc_vpc42a2t/attempt_0/0/error.json\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous complete for workers. 
Result:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m restart_count=0\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m master_addr=10.129.0.93\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m master_port=49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m group_rank=0\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m group_world_size=2\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m local_ranks=[0]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m role_ranks=[0]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m global_ranks=[0]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m role_world_sizes=[2]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m global_world_sizes=[2]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m \n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Starting worker group\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_pj6rj6_8/mnist-zvm96dmvgkq5hc_c86x83a_/attempt_0/0/error.json\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:prior to running the trainer\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:MASTER_ADDR: is 10.129.0.93\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:MASTER_PORT: is 49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:GROUP: 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:LOCAL: 1\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading MNIST dataset...\n", + 
"\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to 
./MNIST/raw/t10k-labels-idx1-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Validation sanity check: 0it [00:00, ?it/s]\n", + "\u001b[2m\u001b[36m(CommandActor pid=145, ip=10.129.0.93)\u001b[0m [0]:Validation sanity check: 0%| | 0/2 [00:00