From f8abbf7d9bdd5a7d2962df29d76548bc3fc76d0f Mon Sep 17 00:00:00 2001 From: Vicky Tsang Date: Mon, 13 Mar 2023 11:50:56 -0700 Subject: [PATCH 1/4] [WIP] enable AMD GPU Signed-off-by: Vicky Tsang --- monai/deploy/packager/util.py | 26 ++++++++++++++++++++------ monai/deploy/runner/runner.py | 6 +++++- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/monai/deploy/packager/util.py b/monai/deploy/packager/util.py index 3db72128..ffce41c2 100644 --- a/monai/deploy/packager/util.py +++ b/monai/deploy/packager/util.py @@ -42,7 +42,11 @@ def verify_base_image(base_image: str) -> str: str: returns string identifier of the dockerfile template to build MAP if valid base image provided, returns empty string otherwise """ - valid_prefixes = {"nvcr.io/nvidia/cuda": "ubuntu", "nvcr.io/nvidia/pytorch": "pytorch"} + import torch + if "AMD" not in torch.cuda.get_device_name(0): + valid_prefixes = {"nvcr.io/nvidia/cuda": "ubuntu", "nvcr.io/nvidia/pytorch": "pytorch"} + else: + valid_prefixes = {"rocm": "ubuntu", "rocm/pytorch": "pytorch"} for prefix, template in valid_prefixes.items(): if prefix in base_image: @@ -89,12 +93,22 @@ def initialize_args(args: Namespace) -> Dict: if args.base: dockerfile_type = verify_base_image(args.base) if not dockerfile_type: - logger.error( - "Provided base image '{}' is not supported \n \ - Please provide a Cuda or Pytorch image from https://ngc.nvidia.com/ (nvcr.io/nvidia)".format( - args.base + import torch + if "AMD" not in torch.cuda.get_device_name(0): + logger.error( + "Provided base image '{}' is not supported \n \ + Please provide a Cuda or Pytorch image from https://ngc.nvidia.com/ (nvcr.io/nvidia)".format( + args.base + ) ) - ) + else: + logger.error( + "Provided base image '{}' is not supported \n \ + Please provide a ROCm or Pytorch image from https://hub.docker.com/r/rocm/pytorch".format( + args.base + ) + ) + sys.exit(1) processed_args["dockerfile_type"] = dockerfile_type if args.base else DefaultValues.DOCKERFILE_TYPE diff --git a/monai/deploy/runner/runner.py b/monai/deploy/runner/runner.py index ee3e183b..9a145e48 100644 --- a/monai/deploy/runner/runner.py +++ b/monai/deploy/runner/runner.py @@ -87,7 +87,9 @@ def run_app(map_name: str, input_path: Path, output_path: Path, app_info: dict, # Use nvidia-docker if GPU resources are requested requested_gpus = get_requested_gpus(pkg_info) if requested_gpus > 0: - cmd = "nvidia-docker run --rm -a STDERR" + import torch + if "AMD" not in torch.cuda.get_device_name(0): + cmd = "nvidia-docker run --rm -a STDERR" if not quiet: cmd += " -a STDOUT" @@ -160,6 +162,8 @@ def pkg_specific_dependency_verification(pkg_info: dict) -> bool: """ requested_gpus = get_requested_gpus(pkg_info) if requested_gpus > 0: + import torch + if "AMD" not in torch.cuda.get_device_name(0): # check for nvidia-docker prog = "nvidia-docker" logger.info('--> Verifying if "%s" is installed...\n', prog) From aaf14cccd487b4d54e782e8ce74d8c5999dddd91 Mon Sep 17 00:00:00 2001 From: Vicky Tsang Date: Fri, 24 Mar 2023 12:31:05 -0700 Subject: [PATCH 2/4] check for AMD GPU device and rocm installation with rocminfo Signed-off-by: Vicky Tsang --- monai/deploy/packager/util.py | 15 +++++++-------- monai/deploy/runner/runner.py | 21 ++++++++++---------- monai/deploy/utils/deviceutil.py | 33 ++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 19 deletions(-) create mode 100644 monai/deploy/utils/deviceutil.py diff --git a/monai/deploy/packager/util.py b/monai/deploy/packager/util.py index ffce41c2..6506895e 100644 --- a/monai/deploy/packager/util.py +++ b/monai/deploy/packager/util.py @@ -23,6 +23,7 @@ from monai.deploy.exceptions import WrongValueError from monai.deploy.packager.constants import DefaultValues from monai.deploy.packager.templates import Template +from monai.deploy.utils.deviceutil import has_rocm from monai.deploy.utils.fileutil import checksum from monai.deploy.utils.importutil import dist_module_path, dist_requires, get_application from monai.deploy.utils.spinner import ProgressSpinner @@ -42,11 +43,10 @@ def verify_base_image(base_image: str) -> str: str: returns string identifier of the dockerfile template to build MAP if valid base image provided, returns empty string otherwise """ - import torch - if "AMD" not in torch.cuda.get_device_name(0): - valid_prefixes = {"nvcr.io/nvidia/cuda": "ubuntu", "nvcr.io/nvidia/pytorch": "pytorch"} - else: + if has_rocm(): valid_prefixes = {"rocm": "ubuntu", "rocm/pytorch": "pytorch"} + else: + valid_prefixes = {"nvcr.io/nvidia/cuda": "ubuntu", "nvcr.io/nvidia/pytorch": "pytorch"} for prefix, template in valid_prefixes.items(): if prefix in base_image: @@ -93,18 +93,17 @@ def initialize_args(args: Namespace) -> Dict: if args.base: dockerfile_type = verify_base_image(args.base) if not dockerfile_type: - import torch - if "AMD" not in torch.cuda.get_device_name(0): + if has_rocm(): logger.error( "Provided base image '{}' is not supported \n \ - Please provide a Cuda or Pytorch image from https://ngc.nvidia.com/ (nvcr.io/nvidia)".format( + Please provide a ROCm or Pytorch image from https://hub.docker.com/r/rocm/pytorch".format( args.base ) ) else: logger.error( "Provided base image '{}' is not supported \n \ - Please provide a ROCm or Pytorch image from https://hub.docker.com/r/rocm/pytorch".format( + Please provide a Cuda or Pytorch image from https://ngc.nvidia.com/ (nvcr.io/nvidia)".format( args.base ) ) diff --git a/monai/deploy/runner/runner.py b/monai/deploy/runner/runner.py index 9a145e48..d521bf30 100644 --- a/monai/deploy/runner/runner.py +++ b/monai/deploy/runner/runner.py @@ -20,6 +20,7 @@ from typing import Tuple from monai.deploy.runner.utils import get_requested_gpus, run_cmd, verify_image +from monai.deploy.utils.deviceutil import has_rocm logger = logging.getLogger("app_runner") @@ -87,9 +88,8 @@ def run_app(map_name: str, input_path: Path, output_path: Path, app_info: dict, # Use nvidia-docker if GPU resources are requested requested_gpus = get_requested_gpus(pkg_info) if requested_gpus > 0: - import torch - if "AMD" not in torch.cuda.get_device_name(0): - cmd = "nvidia-docker run --rm -a STDERR" + if not has_rocm(): + cmd = "nvidia-docker run --rm -a STDERR" if not quiet: cmd += " -a STDOUT" @@ -162,14 +162,13 @@ def pkg_specific_dependency_verification(pkg_info: dict) -> bool: """ requested_gpus = get_requested_gpus(pkg_info) if requested_gpus > 0: - import torch - if "AMD" not in torch.cuda.get_device_name(0): - # check for nvidia-docker - prog = "nvidia-docker" - logger.info('--> Verifying if "%s" is installed...\n', prog) - if not shutil.which(prog): - logger.error('ERROR: "%s" not installed, please install nvidia-docker.', prog) - return False + if not has_rocm(): + # check for nvidia-docker + prog = "nvidia-docker" + logger.info('--> Verifying if "%s" is installed...\n', prog) + if not shutil.which(prog): + logger.error('ERROR: "%s" not installed, please install nvidia-docker.', prog) + return False return True diff --git a/monai/deploy/utils/deviceutil.py b/monai/deploy/utils/deviceutil.py new file mode 100644 index 00000000..65645349 --- /dev/null +++ b/monai/deploy/utils/deviceutil.py @@ -0,0 +1,33 @@ +# Copyright 2023 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess + + +def has_rocm(): + """Return True if ROCm is installed and GPU device is detected. + + Args: + + Returns: + True if ROCm is installed and GPU device is detected, otherwise False. + """ + cmd = "rocminfo" + process = subprocess.run([cmd], stdout=subprocess.PIPE) + for line_in in process.stdout.decode().splitlines(): + if "Device Type" in line_in and "GPU" in line_in: + return True + + return False + + +if __name__ == "__main__": + print(has_rocm()) From 0e475f4d93a610e15c399b3777814f566c4be9ec Mon Sep 17 00:00:00 2001 From: Vicky Tsang Date: Fri, 24 Mar 2023 12:34:14 -0700 Subject: [PATCH 3/4] update docs/tutorials with AMD GPU/rocm references Signed-off-by: Vicky Tsang --- docs/source/getting_started/tutorials/mednist_app.md | 8 ++++++++ docs/source/getting_started/tutorials/monai_bundle_app.md | 4 ++++ docs/source/getting_started/tutorials/multi_model_app.md | 4 ++++ docs/source/getting_started/tutorials/segmentation_app.md | 4 ++++ 4 files changed, 20 insertions(+) diff --git a/docs/source/getting_started/tutorials/mednist_app.md b/docs/source/getting_started/tutorials/mednist_app.md index 539d98fe..fa7e3605 100644 --- a/docs/source/getting_started/tutorials/mednist_app.md +++ b/docs/source/getting_started/tutorials/mednist_app.md @@ -88,6 +88,14 @@ monai-deploy package examples/apps/mednist_classifier_monaideploy/mednist_classi --model classifier.zip \ -l DEBUG +# For AMD GPUs, nvidia-docker is not required. Use --base [base image] option to override the docker base image. +# Please see https://hub.docker.com/r/rocm/pytorch for rocm/pytorch docker images. +monai-deploy package -b rocm/pytorch:rocm5.4.1_ubuntu20.04_py3.7_pytorch_1.12.1 \ + examples/apps/mednist_classifier_monaideploy/mednist_classifier_monaideploy.py \ + --tag mednist_app:latest \ + --model classifier.zip \ + -l DEBUG + # Run the app with docker image and input file locally monai-deploy run mednist_app:latest input output cat output/output.json diff --git a/docs/source/getting_started/tutorials/monai_bundle_app.md b/docs/source/getting_started/tutorials/monai_bundle_app.md index bc47b1b5..986e1ff2 100644 --- a/docs/source/getting_started/tutorials/monai_bundle_app.md +++ b/docs/source/getting_started/tutorials/monai_bundle_app.md @@ -66,6 +66,10 @@ monai-deploy exec ../examples/apps/ai_spleen_seg_app/app.py -i dcm/ -o output -m # Please see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker to install nvidia-docker2. monai-deploy package examples/apps/ai_spleen_seg_app --tag seg_app:latest --model model.ts -l DEBUG +# For AMD GPUs, nvidia-docker is not required. Use --base [base image] option to override the docker base image. +# Please see https://hub.docker.com/r/rocm/pytorch for rocm/pytorch docker images. +monai-deploy package -b rocm/pytorch:rocm5.4.1_ubuntu20.04_py3.7_pytorch_1.12.1 examples/apps/ai_spleen_seg_app --tag seg_app:latest --model model.ts -l DEBUG + # Run the app with docker image and input file locally monai-deploy run seg_app:latest dcm/ output ``` diff --git a/docs/source/getting_started/tutorials/multi_model_app.md b/docs/source/getting_started/tutorials/multi_model_app.md index 44b2bcbd..307cb6f1 100644 --- a/docs/source/getting_started/tutorials/multi_model_app.md +++ b/docs/source/getting_started/tutorials/multi_model_app.md @@ -66,6 +66,10 @@ monai-deploy exec ../examples/apps/examples/apps/ai_multi_ai_app/app.py -i dcm/ # Please see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker to install nvidia-docker2. monai-deploy package -b nvcr.io/nvidia/pytorch:22.08-py3 examples/apps/ai_multi_ai_app --tag multi_model_app:latest --model multi_models -l DEBUG +# For AMD GPUs, nvidia-docker is not required. Use --base [base image] option to override the docker base image. +# Please see https://hub.docker.com/r/rocm/pytorch for rocm/pytorch docker images. +monai-deploy package -b rocm/pytorch:rocm5.4.1_ubuntu20.04_py3.7_pytorch_1.12.1 examples/apps/ai_multi_ai_app --tag multi_model_app:latest --model multi_models -l DEBUG + # Run the app with docker image and input file locally monai-deploy run multi_model_app:latest dcm/ output ``` diff --git a/docs/source/getting_started/tutorials/segmentation_app.md b/docs/source/getting_started/tutorials/segmentation_app.md index 2905729d..9ee72515 100644 --- a/docs/source/getting_started/tutorials/segmentation_app.md +++ b/docs/source/getting_started/tutorials/segmentation_app.md @@ -76,6 +76,10 @@ python examples/apps/ai_spleen_seg_app/app.py -i dcm/ -o output -m model.ts # Please see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker to install nvidia-docker2. monai-deploy package examples/apps/ai_spleen_seg_app --tag seg_app:latest --model model.ts -l DEBUG +# For AMD GPUs, nvidia-docker is not required. Use --base [base image] option to override the docker base image. +# Please see https://hub.docker.com/r/rocm/pytorch for rocm/pytorch docker images. +monai-deploy package -b rocm/pytorch:rocm5.4.1_ubuntu20.04_py3.7_pytorch_1.12.1 examples/apps/ai_spleen_seg_app --tag seg_app:latest --model model.ts -l DEBUG + # Run the app with docker image and input file locally monai-deploy run seg_app:latest dcm/ output ``` From 4989e6e7b6c9d6d6d22b8200a45a074a14729c1d Mon Sep 17 00:00:00 2001 From: Vicky Tsang Date: Tue, 28 Mar 2023 19:03:18 -0700 Subject: [PATCH 4/4] remove rocm dependency in packager Signed-off-by: Vicky Tsang --- monai/deploy/packager/util.py | 24 ++++++++---------------- monai/deploy/utils/deviceutil.py | 11 +++++++---- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/monai/deploy/packager/util.py b/monai/deploy/packager/util.py index 6506895e..2249775b 100644 --- a/monai/deploy/packager/util.py +++ b/monai/deploy/packager/util.py @@ -23,7 +23,6 @@ from monai.deploy.exceptions import WrongValueError from monai.deploy.packager.constants import DefaultValues from monai.deploy.packager.templates import Template -from monai.deploy.utils.deviceutil import has_rocm from monai.deploy.utils.fileutil import checksum from monai.deploy.utils.importutil import dist_module_path, dist_requires, get_application from monai.deploy.utils.spinner import ProgressSpinner @@ -43,8 +42,8 @@ def verify_base_image(base_image: str) -> str: str: returns string identifier of the dockerfile template to build MAP if valid base image provided, returns empty string otherwise """ - if has_rocm(): - valid_prefixes = {"rocm": "ubuntu", "rocm/pytorch": "pytorch"} + if "rocm" in base_image: + valid_prefixes = {"rocm/pytorch": "ubuntu"} else: valid_prefixes = {"nvcr.io/nvidia/cuda": "ubuntu", "nvcr.io/nvidia/pytorch": "pytorch"} @@ -93,20 +92,13 @@ def initialize_args(args: Namespace) -> Dict: if args.base: dockerfile_type = verify_base_image(args.base) if not dockerfile_type: - if has_rocm(): - logger.error( - "Provided base image '{}' is not supported \n \ - Please provide a ROCm or Pytorch image from https://hub.docker.com/r/rocm/pytorch".format( - args.base - ) - ) - else: - logger.error( - "Provided base image '{}' is not supported \n \ - Please provide a Cuda or Pytorch image from https://ngc.nvidia.com/ (nvcr.io/nvidia)".format( - args.base - ) + logger.error( + "Provided base image '{}' is not supported \n \ + Please provide a ROCm or Cuda based Pytorch image from \n \ + https://hub.docker.com/r/rocm/pytorch or https://ngc.nvidia.com/ (nvcr.io/nvidia)".format( + args.base ) + ) sys.exit(1) diff --git a/monai/deploy/utils/deviceutil.py b/monai/deploy/utils/deviceutil.py index 65645349..6a62e33b 100644 --- a/monai/deploy/utils/deviceutil.py +++ b/monai/deploy/utils/deviceutil.py @@ -21,10 +21,13 @@ def has_rocm(): True if ROCm is installed and GPU device is detected, otherwise False. """ cmd = "rocminfo" - process = subprocess.run([cmd], stdout=subprocess.PIPE) - for line_in in process.stdout.decode().splitlines(): - if "Device Type" in line_in and "GPU" in line_in: - return True + try: + process = subprocess.run([cmd], stdout=subprocess.PIPE) + for line_in in process.stdout.decode().splitlines(): + if "Device Type" in line_in and "GPU" in line_in: + return True + except Exception: + pass return False