diff --git a/custom-nb-image/Dockerfile b/custom-nb-image/Dockerfile new file mode 100644 index 000000000..fdaeb96d1 --- /dev/null +++ b/custom-nb-image/Dockerfile @@ -0,0 +1,33 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM quay.io/thoth-station/s2i-minimal-py38-notebook:latest + +# Install: torch (v1.12), ray (v2.1.0) and others + +COPY requirements.txt requirements.txt + +RUN pip install --no-cache-dir -r requirements.txt + +RUN pip uninstall pickle5 -y + +# Pull notebooks in +COPY codeflare/ /home/codeflare + +# Install codeflare-cli and other libraries +RUN pip install --no-cache-dir codeflare-sdk==0.2.1 \ datasets==2.6.1 \ transformers==4.23.1 \ evaluate==0.3.0 \ git+https://github.com/MichaelClifford/torchx.git@ray2-patch diff --git a/custom-nb-image/imagestream.yaml b/custom-nb-image/imagestream.yaml new file mode 100644 index 000000000..10ded6b07 --- /dev/null +++ b/custom-nb-image/imagestream.yaml @@ -0,0 +1,39 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +kind: ImageStream +apiVersion: image.openshift.io/v1 +metadata: + name: codeflare-notebook + labels: + opendatahub.io/notebook-image: 'true' + annotations: + opendatahub.io/notebook-image-name: + "Codeflare Notebook" + opendatahub.io/notebook-image-desc: "Custom Jupyter notebook image with codeflare SDK, Python 3.8, Ray 2.1.0 and PyTorch 1.12.1" +spec: + lookupPolicy: + local: true + tags: + - annotations: + openshift.io/imported-from: quay.io/project-codeflare/notebook + name: latest + from: + kind: DockerImage + name: quay.io/project-codeflare/notebook:latest + name: "latest" + referencePolicy: + type: Source + importPolicy: + scheduled: true diff --git a/custom-nb-image/requirements.txt b/custom-nb-image/requirements.txt new file mode 100644 index 000000000..44938879e --- /dev/null +++ b/custom-nb-image/requirements.txt @@ -0,0 +1,205 @@ +# +# These requirements were autogenerated by pipenv +# To regenerate from the project's Pipfile, run: +# +# pipenv lock --requirements +# + +-i https://pypi.org/simple +aiohttp-cors==0.7.0 +aiohttp==3.8.3 +aiorwlock==1.3.0 +aiosignal==1.2.0; python_version >= '3.6' +anyio==3.6.1; python_full_version >= '3.6.2' +argon2-cffi-bindings==21.2.0; python_version >= '3.6' +argon2-cffi==21.3.0; python_version >= '3.6' +asgiref==3.5.2; python_version >= '3.7' +asttokens==2.0.8 +astunparse==1.6.3 +async-timeout==4.0.2; python_version >= '3.6' +attrs==22.1.0; python_version >= '3.5' +babel==2.10.3; python_version >= '3.6' +backcall==0.2.0 +bcrypt==4.0.0; python_version >= '3.6' +beautifulsoup4==4.11.1; python_version >= '3.6' +black==22.8.0; python_full_version >= '3.6.2' +bleach==5.0.1; python_version >= '3.7' +blessed==1.19.1; python_version >= '2.7' +boto3==1.17.11 +botocore==1.20.112; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' +cachetools==5.2.0; python_version ~= '3.7' +certifi==2022.9.24; 
python_version >= '3.6' +cffi==1.15.1 +charset-normalizer==2.1.1; python_version >= '3.6' +click==8.0.4; python_version >= '3.6' +cloudpickle==2.2.0; python_version >= '3.6' +codeflare==0.1.2.dev0 +colorama==0.4.5; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +colorful==0.5.4 +contourpy==1.0.5; python_version >= '3.7' +cryptography==38.0.1; python_version >= '3.6' +cycler==0.11.0; python_version >= '3.6' +cython==0.29.32 +dask[array,dataframe]==2021.2.0 +dataclasses==0.6 +debugpy==1.6.3; python_version >= '3.7' +decorator==5.1.1; python_version >= '3.5' +defusedxml==0.7.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +distlib==0.3.6 +entrypoints==0.4; python_version >= '3.6' +executing==1.1.0 +fastapi==0.85.0 +fastjsonschema==2.16.2 +filelock==3.8.0; python_version >= '3.7' +flatbuffers==22.9.24 +fonttools==4.37.3; python_version >= '3.7' +frozenlist==1.3.1; python_version >= '3.7' +fsspec==2022.8.2 +future==0.18.2; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' +gitdb==4.0.9; python_version >= '3.6' +gitpython==3.1.27; python_version >= '3.7' +google-api-core==2.10.1; python_version >= '3.6' +google-auth==2.12.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' +googleapis-common-protos==1.56.4; python_version >= '3.7' +gpustat==1.0.0 +graphviz==0.20.1; python_version >= '3.7' +greenery==3.3.3 +grpcio==1.43.0; python_version >= '3.6' +h11==0.14.0; python_version >= '3.7' +hyperopt==0.2.5 +idna==3.4; python_version >= '3.5' +importlib-metadata==4.12.0; python_version < '3.10' +importlib-resources==5.9.0; python_version < '3.9' +ipykernel==6.16.0; python_version >= '3.7' +ipython-genutils==0.2.0 +ipython==8.5.0; python_version >= '3.8' +ipywidgets==8.0.2 +iso8601==1.1.0; python_version < '4' and python_full_version >= '3.6.2' +jedi==0.18.1; python_version >= '3.6' +jinja2==3.1.2; python_version >= '3.7' +jmespath==0.10.0; python_version >= '2.6' and 
python_version not in '3.0, 3.1, 3.2, 3.3' +joblib==1.2.0; python_version >= '3.7' +json5==0.9.10 +jsonref==0.2 +jsonschema==4.16.0; python_version >= '3.7' +jsonsubschema==0.0.6 +jupyter-client==7.3.5; python_version >= '3.7' +jupyter-core==4.11.1; python_version >= '3.7' +jupyter-server-mathjax==0.2.6; python_version >= '3.7' +jupyter-server==1.19.1; python_version >= '3.7' +jupyterlab-git==0.30.0 +jupyterlab-pygments==0.2.2; python_version >= '3.7' +jupyterlab-s3-browser==0.10.1 +jupyterlab-server==2.15.2; python_version >= '3.7' +jupyterlab-widgets==3.0.3; python_version >= '3.7' +jupyterlab==3.4.7; python_version >= '3.7' +kiwisolver==1.4.4; python_version >= '3.7' +kopf==1.35.6 +kubernetes==24.2.0 +lale==0.6.19 +locket==1.0.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +lxml==4.9.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +markupsafe==2.1.1; python_version >= '3.7' +matplotlib-inline==0.1.6; python_version >= '3.5' +matplotlib==3.6.0 +memory-profiler==0.60.0 +mistune==2.0.4 +msgpack==1.0.4 +multidict==6.0.2; python_version >= '3.7' +mypy-extensions==0.4.3 +nbclassic==0.4.3; python_version >= '3.7' +nbclient==0.6.8; python_version >= '3.7' +nbconvert==7.0.0; python_version >= '3.7' +nbdime==3.1.1; python_version >= '3.6' +nbformat==5.6.1; python_version >= '3.7' +nest-asyncio==1.5.5; python_version >= '3.5' +networkx==2.8.6; python_version >= '3.8' +notebook-shim==0.1.0; python_version >= '3.7' +notebook==6.4.12; python_version >= '3.7' +numpy==1.23.3 +nvidia-ml-py==11.495.46 +oauthlib==3.2.1; python_version >= '3.6' +opencensus-context==0.1.3 +opencensus==0.11.0 +openshift-client==1.0.18 +packaging==21.3 +pandas==1.5.0 +pandocfilters==1.5.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +paramiko==2.11.0 +parso==0.8.3; python_version >= '3.6' +partd==1.3.0 +pathspec==0.10.1; python_version >= '3.7' +pexpect==4.8.0; sys_platform != 'win32' +pickleshare==0.7.5 
+pillow==9.2.0; python_version >= '3.7' +pkgutil-resolve-name==1.3.10; python_version < '3.9' +platformdirs==2.5.2; python_version >= '3.7' +portion==2.3.0; python_version ~= '3.6' +prometheus-client==0.13.1 +prompt-toolkit==3.0.31; python_full_version >= '3.6.2' +protobuf==3.20.1; python_version >= '3.7' +psutil==5.9.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +ptyprocess==0.7.0; os_name != 'nt' +pure-eval==0.2.2 +py-spy==0.3.14 +pyarrow==6.0.1 +pyasn1-modules==0.2.8 +pyasn1==0.4.8 +pycparser==2.21 +pydantic==1.10.2; python_version >= '3.7' +pygments==2.13.0; python_version >= '3.6' +pynacl==1.5.0; python_version >= '3.6' +pyparsing==3.0.9; python_full_version >= '3.6.8' +pyrsistent==0.18.1; python_version >= '3.7' +python-dateutil==2.8.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +python-dotenv==0.21.0 +python-json-logger==2.0.4; python_version >= '3.5' +pytz==2022.2.1 +pyyaml==6.0; python_version >= '3.6' +pyzmq==24.0.1; python_version >= '3.6' +ray==2.1.0 +requests-oauthlib==1.3.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +requests==2.28.1; python_version >= '3.7' and python_version < '4' +rsa==4.9; python_version >= '3.6' +s3fs==0.3.4 +s3transfer==0.3.7 +scikit-learn==1.1.1 +scipy==1.8.1 +send2trash==1.8.0 +singleton-decorator==1.0.0 +six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +sklearn==0.0 +smart-open==6.2.0 +smmap==5.0.0; python_version >= '3.6' +sniffio==1.3.0; python_version >= '3.7' +sortedcontainers==2.4.0 +soupsieve==2.3.2.post1; python_version >= '3.6' +stack-data==0.5.1 +starlette==0.20.4 +tabulate==0.8.10 +tensorboardx==2.5.1 +terminado==0.15.0; python_version >= '3.7' +threadpoolctl==3.1.0; python_version >= '3.6' +tinycss2==1.1.1; python_version >= '3.6' +tomli==2.0.1; python_full_version < '3.11.0a7' +toolz==0.12.0 +torch==1.12.1 +torchvision==0.13.1 +tornado==6.2; python_version >= '3.7' +tqdm==4.64.1; python_version 
>= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +traitlets==5.4.0; python_version >= '3.7' +tune-sklearn==0.4.3 +typing-extensions==4.3.0; python_version < '3.10' +urllib3==1.26.12 +uvicorn==0.16.0 +virtualenv==20.16.5; python_version >= '3.6' +wcwidth==0.2.5 +webencodings==0.5.1 +websocket-client==1.4.1; python_version >= '3.7' +wheel==0.37.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +widgetsnbextension==4.0.3; python_version >= '3.7' +wrapt==1.14.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +xgboost-ray==0.1.10 +xgboost==1.6.2 +yarl==1.8.1; python_version >= '3.7' +zipp==3.8.1; python_version < '3.10' diff --git a/demo-notebooks/batch-job/batch_mnist.ipynb b/demo-notebooks/batch-job/batch_mnist.ipynb new file mode 100644 index 000000000..6512c9be1 --- /dev/null +++ b/demo-notebooks/batch-job/batch_mnist.ipynb @@ -0,0 +1,2025 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 12, + "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", + "metadata": {}, + "outputs": [], + "source": [ + "# Import pieces from codeflare-sdk\n", + "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n", + "from codeflare_sdk.cluster.auth import TokenAuthentication" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "614daa0c", + "metadata": {}, + "outputs": [], + "source": [ + "# Create authentication object for oc user permissions\n", + "auth = TokenAuthentication(\n", + " token = \"XXXX\",\n", + " server = \"XXXX\",\n", + " skip_tls=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "bc27f84c", + "metadata": {}, + "source": [ + "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f4bc870-091f-4e11-9642-cba145710159", + "metadata": {}, + "outputs": [], + "source": [ + "# Create our cluster and submit appwrapper\n", + "cluster = Cluster(ClusterConfiguration(name='mnisttest', min_worker=2, max_worker=2, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"], auth=auth))" + ] + }, + { + "cell_type": "markdown", + "id": "12eef53c", + "metadata": {}, + "source": [ + "Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster AppWrapper yaml onto the MCAD queue, and begin the process of obtaining our resource cluster." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", + "metadata": {}, + "outputs": [], + "source": [ + "# Bring up the cluster\n", + "cluster.up()" + ] + }, + { + "cell_type": "markdown", + "id": "657ebdfb", + "metadata": {}, + "source": [ + "Now, we want to check on the status of our resource cluster, until it is finally ready for use." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "3c1b4311-2e61-44c9-8225-87c2db11363d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
╭─────────────────────────╮\n",
+       "│   🚀 List of CodeFlare  │\n",
+       "│   clusters in queue🚀   │\n",
+       "│ +-----------+---------+ │\n",
+       "│ | Name      | Status  | │\n",
+       "│ +===========+=========+ │\n",
+       "│ | mnisttest | pending | │\n",
+       "│ |           |         | │\n",
+       "│ +-----------+---------+ │\n",
+       "╰─────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────╮\n", + "│ \u001b[3m \u001b[0m\u001b[1;3m 🚀 List of CodeFlare\u001b[0m\u001b[3m \u001b[0m │\n", + "│ \u001b[3m \u001b[0m\u001b[1;3mclusters in queue🚀\u001b[0m\u001b[3m \u001b[0m │\n", + "│ +-----------+---------+ │\n", + "│ |\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0m|\u001b[1m \u001b[0m\u001b[1mStatus \u001b[0m\u001b[1m \u001b[0m| │\n", + "│ +===========+=========+ │\n", + "│ |\u001b[36m \u001b[0m\u001b[36mmnisttest\u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35mpending\u001b[0m\u001b[35m \u001b[0m| │\n", + "│ |\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m| │\n", + "│ +-----------+---------+ │\n", + "╰─────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(False, )" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster.is_ready()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
                  🚀 List of CodeFlare clusters 🚀                  \n",
+       "                                                                    \n",
+       " ╭────────────────────────────────────────────────────────────────╮ \n",
+       " │   Owner                                                        │ \n",
+       " │   mnisttest                                        Active ✅   │ \n",
+       " │                                                                │ \n",
+       " │   URI: ray://mnisttest-head-svc.default.svc:10001              │ \n",
+       " │                                                                │ \n",
+       " │   Dashboard🔗                                                  │ \n",
+       " │                                                                │ \n",
+       " │                      Cluster Resources                         │ \n",
+       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮     │ \n",
+       " │   │  Min  Max  │  │  Memory      CPU         GPU         │     │ \n",
+       " │   │            │  │                                      │     │ \n",
+       " │   │  2    2    │  │  16G~16G     8           4           │     │ \n",
+       " │   │            │  │                                      │     │ \n",
+       " │   ╰────────────╯  ╰──────────────────────────────────────╯     │ \n",
+       " ╰────────────────────────────────────────────────────────────────╯ \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m \u001b[0m\u001b[1;3m 🚀 List of CodeFlare clusters 🚀\u001b[0m\u001b[3m \u001b[0m\n", + "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + " ╭────────────────────────────────────────────────────────────────╮ \n", + " │ \u001b[1;37;42mOwner\u001b[0m │ \n", + " │ \u001b[1;4mmnisttest\u001b[0m Active ✅ │ \n", + " │ │ \n", + " │ \u001b[1mURI:\u001b[0m ray://mnisttest-head-svc.default.svc:10001 │ \n", + " │ │ \n", + " │ \u001b]8;id=309861;ray-dashboard-mnisttest-default.apps.prepfullinstall.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\ │ \n", + " │ │ \n", + " │ \u001b[3m Cluster Resources \u001b[0m │ \n", + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n", + " │ │ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m │ │ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m │ │ \n", + " │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", + " │ │ \u001b[36m \u001b[0m\u001b[36m2 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m16G~16G \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m8 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m4 \u001b[0m\u001b[35m \u001b[0m │ │ \n", + " │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m 
\u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n", + " ╰────────────────────────────────────────────────────────────────╯ \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster.status()" + ] + }, + { + "cell_type": "markdown", + "id": "87d2c9b3", + "metadata": {}, + "source": [ + "Now that our resource cluster is ready, we can directly submit our batch job (model training on two workers with four gpus each) to the cluster via torchx." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "3cc6183a-8f6e-4347-af91-d088ed422544", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "environemnt before exec ddp from torchx {'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL'}\n", + "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Checking for changes in workspace `file:///opt/app-root/src/codeflare/notebooks/jobs`...\n", + "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m To disable workspaces pass: --workspace=\"\" from CLI or workspace=None programmatically.\n", + "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Built new image `/tmp/torchx_workspace3c_d437b` based on original image `ghcr.io/pytorch/torchx:0.3.0dev0` and changes in workspace `file:///opt/app-root/src/codeflare/notebooks/jobs` for role[0]=mnist.\n", + "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 WARNING \u001b[0m The Ray scheduler does not support port mapping.\n", + "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Uploading package gcs://_ray_pkg_ce2c3e935774455d.zip.\n", + "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Creating a file package for local 
directory '/tmp/torchx_workspace3c_d437b'.\n", + "ray://torchx/mnisttest-head-svc.default.svc:8265-mnist-jlm13hx5g53mk\n", + "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Launched app: ray://torchx/mnisttest-head-svc.default.svc:8265-mnist-jlm13hx5g53mk\n", + "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m AppStatus:\n", + " msg: PENDING\n", + " num_restarts: -1\n", + " roles:\n", + " - replicas:\n", + " - hostname: \n", + " id: 0\n", + " role: ray\n", + " state: !!python/object/apply:torchx.specs.api.AppState\n", + " - 2\n", + " structured_error_msg: \n", + " role: ray\n", + " state: PENDING (2)\n", + " structured_error_msg: \n", + " ui_url: null\n", + "\n", + "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Job URL: None\n", + "\u001b[0m" + ] + } + ], + "source": [ + "! torchx run -s ray -cfg dashboard_address=mnisttest-head-svc.default.svc:8265,requirements=requirements.txt dist.ddp -j 2x4 --gpu 4 --script mnist.py" + ] + }, + { + "cell_type": "markdown", + "id": "ff065051", + "metadata": {}, + "source": [ + "Now we can go ahead and look at the status and logs of our batch job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "ced6ccd6-a17e-413a-a0e4-65004fc35463", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[37mJob submission server address\u001b[39m: \u001b[1mhttp://mnisttest-head-svc.default.svc:8265\u001b[22m\n", + "{'mnist-jlm13hx5g53mk': JobInfo(status='SUCCEEDED', entrypoint='python3 ray_driver.py', message='Job finished successfully.', error_type=None, start_time=1667574271415, end_time=1667574616127, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_ce2c3e935774455d.zip', 'pip': {'packages': ['pytorch_lightning==1.5.10', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'e4ce38d001dbbe09cd21c497fedd03d692b2be3e'})}\n", + "\u001b[0m" + ] + } + ], + "source": [ + "cluster.list_jobs()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "e5c0b0da-c22e-4142-b096-407ac8aebe5e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[37mJob submission server address\u001b[39m: \u001b[1mhttp://mnisttest-head-svc.default.svc:8265\u001b[22m\n", + "\n", + "\u001b[32m-----------------------------------\u001b[39m\n", + "\u001b[32mJob 'mnist-jlm13hx5g53mk' succeeded\u001b[39m\n", + "\u001b[32m-----------------------------------\u001b[39m\n", + "\n", + "\u001b[0m" + ] + } + ], + "source": [ + "cluster.job_status(\"mnist-jlm13hx5g53mk\")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "264c1809-de72-4acf-b0f6-e67d345640f6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[37mJob submission server address\u001b[39m: \u001b[1mhttp://mnisttest-head-svc.default.svc:8265\u001b[22m\n", + "acrtors: [RayActor(name='mnist', command=['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' 
--nnodes 2 --nproc_per_node 4 --node_rank '0' --tee 3 --role '' mnist.py\"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL'}, num_cpus=2, num_gpus=4), RayActor(name='mnist', command=['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '1' --tee 3 --role '' mnist.py\"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL'}, num_cpus=2, num_gpus=4)]\n", + "Waiting for placement group to start.\n", + "here and rank is 0 and 10.131.66.16 49782\n", + "finally setting actor remote address and port 10.131.66.16 49782\n", + "here and rank is 1 and 10.131.66.16 49782\n", + "setting actor remote address and port 10.131.66.16 49782\n", + "finally setting actor remote address and port 10.131.66.16 49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m get_actor_address_and_port before: 10.131.66.16 42903\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m get_actor_address_and_port: 10.131.66.16 42903\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m set_address_and_port: 10.131.66.16 49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m get_actor_address_and_port before: 10.131.66.16 53621\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m get_actor_address_and_port: 10.131.66.16 53621\n", + "running ray.wait on [ObjectRef(32b0eec39cfa87ac523554acce28b667f9bc98bb0200000001000000), ObjectRef(80b655a2d9b04d4074fb8e3cef07ab2b3516f40e0200000001000000)]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m cmd: ['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '0' --tee 3 --role '' mnist.py\"]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, 
ip=10.131.66.16)\u001b[0m worker env: {'NV_LIBCUBLAS_DEV_VERSION': '11.3.1.68-1', 'NV_CUDA_COMPAT_PACKAGE': 'cuda-compat-11-2', 'RAY_IP': 'mnisttest-head-svc', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_GCS': '6379', 'NV_CUDNN_PACKAGE_DEV': 'libcudnn8-dev=8.1.1.33-1+cuda11.2', 'LC_ALL': 'C.UTF-8', 'LD_LIBRARY_PATH': '/usr/local/nvidia/lib:/usr/local/nvidia/lib64', 'NV_LIBNCCL_DEV_PACKAGE': 'libnccl-dev=2.8.4-1+cuda11.2', 'REDIS_PASSWORD': '', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_PORT': '8265', 'RAY_USAGE_STATS_ENABLED': '0', 'LANG': 'C.UTF-8', 'TZ': 'America/Los_Angeles', 'NV_LIBNPP_DEV_PACKAGE': 'libnpp-dev-11-2=11.2.1.68-1', 'HOSTNAME': 'mnisttest-worker-small-group-mnisttest-wzz2l', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP': 'tcp://172.30.163.155:8265', 'OLDPWD': '/home/ray/workspace', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_ADDR': '172.30.163.155', 'RAY_CLIENT_MODE': '0', 'RAY_JOB_ID': '02000000', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_ADDR': '172.30.163.155', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_ADDR': '172.30.163.155', 'NV_LIBNPP_VERSION': '11.2.1.68-1', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_PROTO': 'tcp', 'NVIDIA_VISIBLE_DEVICES': 'GPU-d3e8af45-f80b-98a8-dcd8-d3b428c4a4c2,GPU-15e57e64-c38b-9923-8f4a-6c098fdbc062,GPU-d14042c5-219c-5419-9511-ac62c72f90d1,GPU-b0d6ba11-ccb2-c4fb-89ad-01c50e6d393c', 'VIRTUAL_ENV': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/pip/3510e0c008a5c3627e4d2408c8b93ed71be6c3e1/virtualenv', 'NV_LIBCUSPARSE_VERSION': '11.3.1.68-1', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_DASHBOARD': '8265', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_CLIENT': '10001', 'KUBERNETES_PORT_443_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_PORT': '6379', 'KUBERNETES_PORT_443_TCP_ADDR': '172.30.0.1', 'NV_LIBCUBLAS_DEV_PACKAGE': 'libcublas-dev-11-2=11.3.1.68-1', 'NCCL_VERSION': '2.8.4-1', 'KUBERNETES_PORT': 'tcp://172.30.0.1:443', 'PWD': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/working_dir_files/_ray_pkg_ce2c3e935774455d', 'NVARCH': 'x86_64', 
'NV_LIBCUSPARSE_DEV_VERSION': '11.3.1.68-1', 'HOME': '/home/ray', 'RAY_RAYLET_PID': '19', 'NV_ML_REPO_URL': 'https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64', 'NV_LIBNCCL_PACKAGE_VERSION': '2.8.4-1', 'SPT_NOENV': '1', 'KUBERNETES_SERVICE_PORT_HTTPS': '443', 'NV_LIBNCCL_PACKAGE': 'libnccl2=2.8.4-1+cuda11.2', 'NV_LIBNCCL_DEV_PACKAGE_NAME': 'libnccl-dev', 'KUBERNETES_PORT_443_TCP_PORT': '443', 'NV_CUDA_LIB_VERSION': '11.2.0-1', 'NV_ML_REPO_ENABLED': '1', 'NV_LIBNPP_PACKAGE': 'libnpp-11-2=11.2.1.68-1', 'NV_LIBNCCL_PACKAGE_NAME': 'libnccl2', 'LIBRARY_PATH': '/usr/local/cuda/lib64/stubs', 'NV_NVTX_VERSION': '11.2.67-1', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP': 'tcp://172.30.163.155:10001', 'NV_LIBCUBLAS_VERSION': '11.3.1.68-1', 'RAY_ADDRESS': 'mnisttest-head-svc:6379', 'NV_LIBCUBLAS_PACKAGE': 'libcublas-11-2=11.3.1.68-1', 'KUBERNETES_PORT_443_TCP': 'tcp://172.30.0.1:443', 'NV_CUDNN_VERSION': '8.1.1.33', 'RAY_PORT': '6379', 'NV_CUDA_CUDART_DEV_VERSION': '11.2.72-1', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP': 'tcp://172.30.163.155:6379', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_PORT': '10001', 'TERM': 'xterm', 'MNISTTEST_HEAD_SVC_SERVICE_PORT': '6379', 'NV_NVML_DEV_VERSION': '11.2.67-1', 'CUDA_VERSION': '11.2.0', 'NV_LIBCUBLAS_PACKAGE_NAME': 'libcublas-11-2', 'NSS_SDB_USE_CACHE': 'no', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'MY_POD_IP': '10.131.66.16', 'SHLVL': '1', 'PYTHONPATH': ':/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/working_dir_files/_ray_pkg_ce2c3e935774455d:/home/ray/workspace::/home/ray/workspace:', 'NV_LIBCUBLAS_DEV_PACKAGE_NAME': 'libcublas-dev-11-2', 'NVIDIA_REQUIRE_CUDA': 'cuda>=11.2 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450', 'NV_LIBNPP_DEV_VERSION': '11.2.1.68-1', 'KUBERNETES_SERVICE_PORT': '443', 'NV_CUDA_CUDART_VERSION': '11.2.72-1', 'NV_CUDNN_PACKAGE_NAME': 'libcudnn8', 'PATH': 
'/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/pip/3510e0c008a5c3627e4d2408c8b93ed71be6c3e1/virtualenv/bin:/home/ray/anaconda3/bin:/home/ray/anaconda3/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', 'NV_LIBNCCL_DEV_PACKAGE_VERSION': '2.8.4-1', 'MNISTTEST_HEAD_SVC_PORT': 'tcp://172.30.163.155:6379', 'PS1': '(virtualenv) ', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_SERVICE_HOST': '172.30.163.155', 'KUBERNETES_SERVICE_HOST': '172.30.0.1', 'NV_CUDNN_PACKAGE': 'libcudnn8=8.1.1.33-1+cuda11.2', 'OMP_NUM_THREADS': '1', 'PYTHONBREAKPOINT': 'ray.util.rpdb.set_trace', 'CUDA_VISIBLE_DEVICES': '0,1,2,3', 'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL', 'TORCHX_RANK0_HOST': '10.131.66.16'}\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m set_address_and_port: 10.131.66.16 49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m cmd: ['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '1' --tee 3 --role '' mnist.py\"]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m worker env: {'NV_LIBCUBLAS_DEV_VERSION': '11.3.1.68-1', 'NV_CUDA_COMPAT_PACKAGE': 'cuda-compat-11-2', 'RAY_IP': 'mnisttest-head-svc', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_GCS': '6379', 'NV_CUDNN_PACKAGE_DEV': 'libcudnn8-dev=8.1.1.33-1+cuda11.2', 'LC_ALL': 'C.UTF-8', 'LD_LIBRARY_PATH': '/usr/local/nvidia/lib:/usr/local/nvidia/lib64', 'NV_LIBNCCL_DEV_PACKAGE': 'libnccl-dev=2.8.4-1+cuda11.2', 'REDIS_PASSWORD': '', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_PORT': '8265', 'RAY_USAGE_STATS_ENABLED': '0', 'LANG': 'C.UTF-8', 'TZ': 'America/Los_Angeles', 'NV_LIBNPP_DEV_PACKAGE': 'libnpp-dev-11-2=11.2.1.68-1', 'HOSTNAME': 'mnisttest-worker-small-group-mnisttest-hfm8l', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP': 
'tcp://172.30.163.155:8265', 'OLDPWD': '/home/ray/workspace', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_ADDR': '172.30.163.155', 'RAY_CLIENT_MODE': '0', 'RAY_JOB_ID': '02000000', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_ADDR': '172.30.163.155', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_ADDR': '172.30.163.155', 'NV_LIBNPP_VERSION': '11.2.1.68-1', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_PROTO': 'tcp', 'NVIDIA_VISIBLE_DEVICES': 'GPU-48fae530-6bda-e366-3423-864fe847ff3b,GPU-5d8d79bb-5c38-4ef7-0ea8-c91297cbc59f,GPU-8c8b3c0b-ccf8-c06c-f253-0bb90285c4cb,GPU-a8a4e808-841d-c212-2686-a2bd227279b3', 'VIRTUAL_ENV': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/pip/3510e0c008a5c3627e4d2408c8b93ed71be6c3e1/virtualenv', 'NV_LIBCUSPARSE_VERSION': '11.3.1.68-1', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_DASHBOARD': '8265', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_CLIENT': '10001', 'KUBERNETES_PORT_443_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_PORT': '6379', 'KUBERNETES_PORT_443_TCP_ADDR': '172.30.0.1', 'NV_LIBCUBLAS_DEV_PACKAGE': 'libcublas-dev-11-2=11.3.1.68-1', 'NCCL_VERSION': '2.8.4-1', 'KUBERNETES_PORT': 'tcp://172.30.0.1:443', 'PWD': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/working_dir_files/_ray_pkg_ce2c3e935774455d', 'NVARCH': 'x86_64', 'NV_LIBCUSPARSE_DEV_VERSION': '11.3.1.68-1', 'HOME': '/home/ray', 'RAY_RAYLET_PID': '19', 'NV_ML_REPO_URL': 'https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64', 'NV_LIBNCCL_PACKAGE_VERSION': '2.8.4-1', 'SPT_NOENV': '1', 'KUBERNETES_SERVICE_PORT_HTTPS': '443', 'NV_LIBNCCL_PACKAGE': 'libnccl2=2.8.4-1+cuda11.2', 'NV_LIBNCCL_DEV_PACKAGE_NAME': 'libnccl-dev', 'KUBERNETES_PORT_443_TCP_PORT': '443', 'NV_CUDA_LIB_VERSION': '11.2.0-1', 'NV_ML_REPO_ENABLED': '1', 'NV_LIBNPP_PACKAGE': 'libnpp-11-2=11.2.1.68-1', 'NV_LIBNCCL_PACKAGE_NAME': 'libnccl2', 'LIBRARY_PATH': '/usr/local/cuda/lib64/stubs', 'NV_NVTX_VERSION': '11.2.67-1', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP': 'tcp://172.30.163.155:10001', 
'NV_LIBCUBLAS_VERSION': '11.3.1.68-1', 'RAY_ADDRESS': 'mnisttest-head-svc:6379', 'NV_LIBCUBLAS_PACKAGE': 'libcublas-11-2=11.3.1.68-1', 'KUBERNETES_PORT_443_TCP': 'tcp://172.30.0.1:443', 'NV_CUDNN_VERSION': '8.1.1.33', 'RAY_PORT': '6379', 'NV_CUDA_CUDART_DEV_VERSION': '11.2.72-1', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP': 'tcp://172.30.163.155:6379', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_PORT': '10001', 'TERM': 'xterm', 'MNISTTEST_HEAD_SVC_SERVICE_PORT': '6379', 'NV_NVML_DEV_VERSION': '11.2.67-1', 'CUDA_VERSION': '11.2.0', 'NV_LIBCUBLAS_PACKAGE_NAME': 'libcublas-11-2', 'NSS_SDB_USE_CACHE': 'no', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'MY_POD_IP': '10.128.68.15', 'SHLVL': '1', 'PYTHONPATH': ':/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/working_dir_files/_ray_pkg_ce2c3e935774455d:/home/ray/workspace::/home/ray/workspace:', 'NV_LIBCUBLAS_DEV_PACKAGE_NAME': 'libcublas-dev-11-2', 'NVIDIA_REQUIRE_CUDA': 'cuda>=11.2 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450', 'NV_LIBNPP_DEV_VERSION': '11.2.1.68-1', 'KUBERNETES_SERVICE_PORT': '443', 'NV_CUDA_CUDART_VERSION': '11.2.72-1', 'NV_CUDNN_PACKAGE_NAME': 'libcudnn8', 'PATH': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/pip/3510e0c008a5c3627e4d2408c8b93ed71be6c3e1/virtualenv/bin:/home/ray/anaconda3/bin:/home/ray/anaconda3/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', 'NV_LIBNCCL_DEV_PACKAGE_VERSION': '2.8.4-1', 'MNISTTEST_HEAD_SVC_PORT': 'tcp://172.30.163.155:6379', 'PS1': '(virtualenv) ', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_SERVICE_HOST': '172.30.163.155', 'KUBERNETES_SERVICE_HOST': '172.30.0.1', 'NV_CUDNN_PACKAGE': 'libcudnn8=8.1.1.33-1+cuda11.2', 'OMP_NUM_THREADS': '1', 'PYTHONBREAKPOINT': 'ray.util.rpdb.set_trace', 'CUDA_VISIBLE_DEVICES': '0,1,2,3', 'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL', 
'TORCHX_RANK0_HOST': '10.131.66.16'}\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs:\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m entrypoint : mnist.py\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m min_nodes : 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m max_nodes : 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m nproc_per_node : 4\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m run_id : mnist-jlm13hx5g53mk\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m rdzv_backend : static\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m rdzv_endpoint : 10.131.66.16:49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m rdzv_configs : {'rank': 1, 'timeout': 900}\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m max_restarts : 0\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m monitor_interval : 5\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m log_dir : None\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m metrics_cfg : {}\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m \n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] starting workers for entrypoint: python\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous'ing worker group\n", + 
"\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs:\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m entrypoint : mnist.py\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m min_nodes : 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m max_nodes : 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m nproc_per_node : 4\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m run_id : mnist-jlm13hx5g53mk\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m rdzv_backend : static\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m rdzv_endpoint : 10.131.66.16:49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m rdzv_configs : {'rank': 0, 'timeout': 900}\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m max_restarts : 0\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m monitor_interval : 5\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m log_dir : None\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m metrics_cfg : {}\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m \n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] starting workers for entrypoint: python\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous'ing worker group\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, 
ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous complete for workers. Result:\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m restart_count=0\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m master_addr=10.131.66.16\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m master_port=49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m group_rank=1\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m group_world_size=2\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m local_ranks=[0, 1, 2, 3]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m role_ranks=[4, 5, 6, 7]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m global_ranks=[4, 5, 6, 7]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m role_world_sizes=[8, 8, 8, 8]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m global_world_sizes=[8, 8, 8, 8]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m \n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Starting worker group\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap/attempt_0/0/error.json\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker1 reply file to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap/attempt_0/1/error.json\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker2 reply file to: 
/tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap/attempt_0/2/error.json\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker3 reply file to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap/attempt_0/3/error.json\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous complete for workers. Result:\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m restart_count=0\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m master_addr=10.131.66.16\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m master_port=49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m group_rank=0\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m group_world_size=2\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m local_ranks=[0, 1, 2, 3]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m role_ranks=[0, 1, 2, 3]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m global_ranks=[0, 1, 2, 3]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m role_world_sizes=[8, 8, 8, 8]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m global_world_sizes=[8, 8, 8, 8]\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m \n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Starting worker group\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1/attempt_0/0/error.json\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m 
INFO:torch.distributed.elastic.multiprocessing:Setting worker1 reply file to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1/attempt_0/1/error.json\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker2 reply file to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1/attempt_0/2/error.json\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker3 reply file to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1/attempt_0/3/error.json\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:prior to running the trainer\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:MASTER_ADDR: is 10.131.66.16\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:MASTER_PORT: is 49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:GROUP: 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:LOCAL: 4\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:prior to running the trainer\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:MASTER_ADDR: is 10.131.66.16\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:MASTER_PORT: is 49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:GROUP: 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:LOCAL: 4\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading MNIST dataset...\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:prior to running the trainer\n", + 
"\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:MASTER_ADDR: is 10.131.66.16\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:MASTER_PORT: is 49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:GROUP: 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:LOCAL: 4\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:prior to running the trainer\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:MASTER_ADDR: is 10.131.66.16\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:MASTER_PORT: is 49782\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:GROUP: 2\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:LOCAL: 4\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading 
http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Validation sanity check: 0it [00:00, ?it/s][0]:\n", + "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Validation sanity check: 0%| | 0/2 [00:00╭─────────────────────────╮\n", + "│ 🚀 List of CodeFlare │\n", + "│ clusters in queue🚀 │\n", + "│ +-----------+---------+ │\n", + "│ | Name | Status | │\n", + "│ +===========+=========+ │\n", + "│ | hfgputest | pending | │\n", + "│ | | | │\n", + "│ +-----------+---------+ │\n", + "╰─────────────────────────╯\n", + "\n" + ], + "text/plain": [ + "╭─────────────────────────╮\n", + "│ \u001b[3m \u001b[0m\u001b[1;3m 🚀 List of CodeFlare\u001b[0m\u001b[3m \u001b[0m │\n", + "│ \u001b[3m \u001b[0m\u001b[1;3mclusters in queue🚀\u001b[0m\u001b[3m \u001b[0m │\n", + "│ +-----------+---------+ │\n", + "│ |\u001b[1m \u001b[0m\u001b[1mName 
\u001b[0m\u001b[1m \u001b[0m|\u001b[1m \u001b[0m\u001b[1mStatus \u001b[0m\u001b[1m \u001b[0m| │\n", + "│ +===========+=========+ │\n", + "│ |\u001b[36m \u001b[0m\u001b[36mhfgputest\u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35mpending\u001b[0m\u001b[35m \u001b[0m| │\n", + "│ |\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m| │\n", + "│ +-----------+---------+ │\n", + "╰─────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(False, )" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster.is_ready()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "06a54428-f186-4c27-948e-4eaf9c0e34b5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
                  🚀 List of CodeFlare clusters 🚀                  \n",
+       "                                                                    \n",
+       " ╭────────────────────────────────────────────────────────────────╮ \n",
+       " │   Owner                                                        │ \n",
+       " │   hfgputest                                        Active ✅   │ \n",
+       " │                                                                │ \n",
+       " │   URI: ray://hfgputest-head-svc.default.svc:10001              │ \n",
+       " │                                                                │ \n",
+       " │   Dashboard🔗                                                  │ \n",
+       " │                                                                │ \n",
+       " │                      Cluster Resources                         │ \n",
+       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮     │ \n",
+       " │   │  Min  Max  │  │  Memory      CPU         GPU         │     │ \n",
+       " │   │            │  │                                      │     │ \n",
+       " │   │  1    1    │  │  16G~16G     8           4           │     │ \n",
+       " │   │            │  │                                      │     │ \n",
+       " │   ╰────────────╯  ╰──────────────────────────────────────╯     │ \n",
+       " ╰────────────────────────────────────────────────────────────────╯ \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m \u001b[0m\u001b[1;3m 🚀 List of CodeFlare clusters 🚀\u001b[0m\u001b[3m \u001b[0m\n", + "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + " ╭────────────────────────────────────────────────────────────────╮ \n", + " │ \u001b[1;37;42mOwner\u001b[0m │ \n", + " │ \u001b[1;4mhfgputest\u001b[0m Active ✅ │ \n", + " │ │ \n", + " │ \u001b[1mURI:\u001b[0m ray://hfgputest-head-svc.default.svc:10001 │ \n", + " │ │ \n", + " │ \u001b]8;id=552692;ray-dashboard-hfgputest-default.apps.prepfullinstall.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\ │ \n", + " │ │ \n", + " │ \u001b[3m Cluster Resources \u001b[0m │ \n", + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n", + " │ │ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m │ │ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m │ │ \n", + " │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", + " │ │ \u001b[36m \u001b[0m\u001b[36m1 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m16G~16G \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m8 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m4 \u001b[0m\u001b[35m \u001b[0m │ │ \n", + " │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m 
\u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n", + " ╰────────────────────────────────────────────────────────────────╯ \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster.status()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "8ac46c87-70f1-4c70-9648-881151665355", + "metadata": {}, + "outputs": [], + "source": [ + "ray_cluster_uri = cluster.cluster_uri()" + ] + }, + { + "cell_type": "markdown", + "id": "44dba6a0-8275-4726-8911-6b6ec467b6a3", + "metadata": {}, + "source": [ + "**NOTE**: Now we have our resource cluster with the desired GPUs, so we can interact with it to train the HuggingFace model." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "4c458589-5a17-47c6-a8db-625427ae4fe7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ray cluster is up and running: True\n" + ] + } + ], + "source": [ + "#before proceeding make sure the cluster exists and the uri is not empty\n", + "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n", + "\n", + "import ray\n", + "from ray.air.config import ScalingConfig\n", + "\n", + "# reset the ray context in case there's already one. 
\n", + "ray.shutdown()\n", + "# establish connection to ray cluster\n", + "\n", + "#install additionall libraries that will be required for this training\n", + "runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\"]}\n", + "\n", + "ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env)\n", + "\n", + "print(\"Ray cluster is up and running: \", ray.is_initialized())" + ] + }, + { + "cell_type": "markdown", + "id": "94a38146-1321-4b7b-9152-9ebca4eb9444", + "metadata": {}, + "source": [ + "**NOTE** : in this case since we are running a task for which we need additional pip packages. we can install those by passing them in the `runtime_env` variable" + ] + }, + { + "cell_type": "markdown", + "id": "76a1945b-d6c8-49b8-9a4c-b82724cffba9", + "metadata": {}, + "source": [ + "### Transfer learning code from huggingface" + ] + }, + { + "cell_type": "markdown", + "id": "8bdbe888-4f38-4e9a-ae43-67ce89ff9d42", + "metadata": {}, + "source": [ + "We are using the code based on the example **[here](https://huggingface.co/docs/transformers/tasks/sequence_classification)** . 
" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "e69994b4-1a13-43fe-b698-2a5374cb941b", + "metadata": {}, + "outputs": [], + "source": [ + "@ray.remote\n", + "def train_fn():\n", + " from datasets import load_dataset\n", + " import transformers\n", + " from transformers import AutoTokenizer, TrainingArguments\n", + " from transformers import AutoModelForSequenceClassification\n", + " import numpy as np\n", + " from datasets import load_metric\n", + " import ray\n", + " from ray import tune\n", + " from ray.train.huggingface import HuggingFaceTrainer\n", + "\n", + " dataset = load_dataset(\"imdb\")\n", + " tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", + "\n", + " def tokenize_function(examples):\n", + " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", + "\n", + " tokenized_datasets = dataset.map(tokenize_function, batched=True)\n", + "\n", + " #using a fraction of dataset but you can run with the full dataset\n", + " small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42).select(range(100))\n", + " small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42).select(range(100))\n", + "\n", + " print(f\"len of train {small_train_dataset} and test {small_eval_dataset}\")\n", + "\n", + " ray_train_ds = ray.data.from_huggingface(small_train_dataset)\n", + " ray_evaluation_ds = ray.data.from_huggingface(small_eval_dataset)\n", + "\n", + " def compute_metrics(eval_pred):\n", + " metric = load_metric(\"accuracy\")\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return metric.compute(predictions=predictions, references=labels)\n", + "\n", + " def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n", + " model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)\n", + "\n", + " training_args = TrainingArguments(\"/tmp/hf_imdb/test\", eval_steps=1, disable_tqdm=True, \n", + " 
num_train_epochs=1, skip_memory_metrics=True,\n", + " learning_rate=2e-5,\n", + " per_device_train_batch_size=16,\n", + " per_device_eval_batch_size=16, \n", + " weight_decay=0.01,)\n", + " return transformers.Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " compute_metrics=compute_metrics\n", + " )\n", + "\n", + " scaling_config = ScalingConfig(num_workers=4, use_gpu=True) #num workers is the number of gpus\n", + "\n", + " # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. Both have the same method signature. \n", + " # the ray native HFTrainer has built in support for scaling to multiple GPUs\n", + " trainer = HuggingFaceTrainer(\n", + " trainer_init_per_worker=trainer_init_per_worker,\n", + " scaling_config=scaling_config,\n", + " datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n", + " )\n", + " result = trainer.fit()\n" + ] + }, + { + "cell_type": "markdown", + "id": "f9593fee-2b2b-415f-8902-bceec014385f", + "metadata": {}, + "source": [ + "**NOTE:** This code will produce a lot of output and will run for **approximately 2 minutes.** As a part of execution it will download the `imdb` dataset, `distilbert-base-uncased` model and then will start transfer learning task for training the model with this dataset. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "7f0985e9-5e88-4d36-ab38-c3001c13f97c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading builder script: 100%|██████████| 4.31k/4.31k [00:00<00:00, 5.60MB/s]\n", + "Downloading metadata: 100%|██████████| 2.17k/2.17k [00:00<00:00, 3.13MB/s]\n", + "Downloading readme: 100%|██████████| 7.59k/7.59k [00:00<00:00, 9.75MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Downloading and preparing dataset imdb/plain_text to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading data: 0%| | 0.00/84.1M [00:00