|
9 | 9 | "source": [
|
10 | 10 | "# Import pieces from codeflare-sdk\n",
|
11 | 11 | "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n",
|
12 |
| - "from codeflare_sdk.cluster.auth import TokenAuthentication" |
| 12 | + "from codeflare_sdk.cluster.auth import TokenAuthentication\n", |
| 13 | + "from codeflare_sdk.job.jobs import DDPJobDefinition" |
13 | 14 | ]
|
14 | 15 | },
|
15 | 16 | {
|
|
51 | 52 | }
|
52 | 53 | ],
|
53 | 54 | "source": [
|
54 |
| - "# Create our cluster and submit appwrapper\n", |
55 |
| - "cluster = Cluster(ClusterConfiguration(name='mnisttest', min_worker=2, max_worker=2, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))" |
| 55 | + "# Create our cluster and submit appwrapper (reduce specs as desired)\n", |
| 56 | + "cluster = Cluster(ClusterConfiguration(\n", |
| 57 | + " name='mnisttest',\n", |
| 58 | + " namespace='default',\n", |
| 59 | + " min_worker=2,\n", |
| 60 | + " max_worker=2,\n", |
| 61 | + " min_cpus=8,\n", |
| 62 | + " max_cpus=8,\n", |
| 63 | + " min_memory=16,\n", |
| 64 | + " max_memory=16,\n", |
| 65 | + " gpu=4,\n", |
| 66 | + " instascale=True, # Can be set to False if scaling is not needed\n", |
| 67 | + " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"] # Can be removed if instascale is False\n", |
| 68 | + "))" |
56 | 69 | ]
|
57 | 70 | },
|
58 | 71 | {
|
|
293 | 306 | "Now that our resource cluster is ready, we can directly submit our batch job (model training on two workers with four GPUs each) to the cluster via TorchX."
|
294 | 307 | ]
|
295 | 308 | },
|
296 |
| - { |
297 |
| - "cell_type": "code", |
298 |
| - "execution_count": 9, |
299 |
| - "id": "3cc6183a-8f6e-4347-af91-d088ed422544", |
300 |
| - "metadata": {}, |
301 |
| - "outputs": [], |
302 |
| - "source": [ |
303 |
| - "from codeflare_sdk.job.jobs import DDPJobDefinition" |
304 |
| - ] |
305 |
| - }, |
306 | 309 | {
|
307 | 310 | "cell_type": "code",
|
308 | 311 | "execution_count": 10,
|
|
320 | 323 | }
|
321 | 324 | ],
|
322 | 325 | "source": [
|
323 |
| - "job = DDPJobDefinition(script=\"mnist.py\", scheduler_args={\"requirements\": \"requirements.txt\"}).submit(cluster)" |
| 326 | + "job = DDPJobDefinition(\n", |
| 327 | + " script=\"mnist.py\",\n", |
| 328 | + " scheduler_args={\"requirements\": \"requirements.txt\"}\n", |
| 329 | + ")\n", |
| 330 | + "job.submit(cluster)" |
324 | 331 | ]
|
325 | 332 | },
|
326 | 333 | {
|
|
0 commit comments