Skip to content

Commit 882ec11

Browse files
committed
[BE] Move data download logic to download_data.py
Use `download_url_to_file`, which is heavily inspired by https://github.com/pytorch/pytorch/blob/efb73fe8e4413a0d6db078e85c7ed7c91f05ca5d/torch/hub.py#L600. Skip slow/flaky downloads if `FILES_TO_RUN` is defined but the tutorial is not in this shard. Add dcgan_tutorial (which has a massive 1 GB downloadable) and fgsm_tutorial to the list of tutorials with optional downloadable data.
1 parent 309c889 commit 882ec11

File tree

2 files changed

+113
-21
lines changed

2 files changed

+113
-21
lines changed

.jenkins/download_data.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
#!/usr/bin/env python3
2+
import hashlib
3+
import os
4+
5+
from typing import Optional
6+
from urllib.request import urlopen, Request
7+
from pathlib import Path
8+
from zipfile import ZipFile
9+
10+
# Repository root: this script lives one directory below it.
REPO_BASE_DIR = Path(__file__).absolute().parent.parent

# Directory where downloaded archives are stored.
DATA_DIR = REPO_BASE_DIR.joinpath("_data")

# Per-section data directories that the downloads are placed/extracted into.
BEGINNER_DATA_DIR = REPO_BASE_DIR.joinpath("beginner_source", "data")
INTERMEDIATE_DATA_DIR = REPO_BASE_DIR.joinpath("intermediate_source", "data")
ADVANCED_DATA_DIR = REPO_BASE_DIR.joinpath("advanced_source", "data")
PROTOTYPE_DATA_DIR = REPO_BASE_DIR.joinpath("prototype_source", "data")

# Value of the FILES_TO_RUN environment variable, or None when it is unset.
FILES_TO_RUN = os.environ.get("FILES_TO_RUN")
17+
18+
19+
def download_url_to_file(url: str,
                         dst: Optional[str] = None,
                         prefix: Optional[Path] = None,
                         sha256: Optional[str] = None) -> Path:
    """Download ``url`` to a local file and return the file's path.

    Args:
        url: Address of the resource to fetch.
        dst: File name (or path) to save to; defaults to the last path
            component of ``url``.
        prefix: Optional directory that ``dst`` is placed under.
        sha256: Optional expected hex SHA-256 digest of the downloaded bytes.

    Returns:
        Path of the downloaded (or already existing) file.

    Raises:
        RuntimeError: If ``sha256`` is given and does not match the digest of
            the downloaded data. Nothing is left at the destination then.
    """
    dst = dst if dst is not None else Path(url).name
    dst_path = Path(dst) if prefix is None else Path(prefix) / dst
    if dst_path.exists():
        print(f"Skip downloading {url} as {dst_path} already exists")
        return dst_path

    # Stream into a sibling ".partial" file and only rename it into place once
    # the download has finished and the checksum matched. Otherwise an
    # interrupted download would leave a truncated file at the destination,
    # which the "already exists" shortcut above would accept on the next run.
    tmp_path = dst_path.with_name(dst_path.name + ".partial")
    file_size = None
    sha256_sum = hashlib.sha256()
    request = Request(url, headers={"User-Agent": "tutorials.downloader"})
    try:
        # The context manager closes the response even if reading fails.
        with urlopen(request) as response, open(tmp_path, "wb") as f:
            content_length = response.headers.get("Content-Length")
            if content_length is not None:
                file_size = int(content_length)
            while True:
                buffer = response.read(32768)
                if len(buffer) == 0:
                    break
                sha256_sum.update(buffer)
                f.write(buffer)
        digest = sha256_sum.hexdigest()
        if sha256 is not None and sha256 != digest:
            raise RuntimeError(f"Downloaded {url} has unexpected sha256sum {digest} should be {sha256}")
        tmp_path.replace(dst_path)
    except BaseException:
        # Never leave a partial or corrupt temporary file behind.
        if tmp_path.exists():
            tmp_path.unlink()
        raise
    print(f"Downloaded {url} sha256sum={digest} size={file_size}")
    return dst_path
51+
52+
53+
def unzip(archive: Path, tgt_dir: Path) -> None:
    """Extract every member of the zip file ``archive`` into ``tgt_dir``."""
    archive_name = str(archive)
    destination = str(tgt_dir)
    with ZipFile(archive_name, mode="r") as bundle:
        bundle.extractall(path=destination)
56+
57+
58+
def download_hymenoptera_data() -> None:
    """Download hymenoptera_data.zip into DATA_DIR and unzip it into BEGINNER_DATA_DIR."""
    # transfer learning tutorial data
    z = download_url_to_file("https://download.pytorch.org/tutorial/hymenoptera_data.zip",
                             prefix=DATA_DIR,
                             sha256="fbc41b31d544714d18dd1230b1e2b455e1557766e13e67f9f5a7a23af7c02209",
                             )
    unzip(z, BEGINNER_DATA_DIR)
65+
66+
67+
def download_nlp_data() -> None:
    """Download data.zip into DATA_DIR and unzip it under intermediate_source/."""
    # nlp tutorial data
    z = download_url_to_file("https://download.pytorch.org/tutorial/data.zip",
                             prefix=DATA_DIR,
                             sha256="fb317e80248faeb62dc25ef3390ae24ca34b94e276bbc5141fd8862c2200bff5",
                             )
    # This will unzip all files in data.zip to intermediate_source/data/ folder.
    # The Makefile rule this replaces extracted into intermediate_source/ (not
    # intermediate_source/data/), so extract into the parent directory here to
    # avoid ending up with intermediate_source/data/data/.
    unzip(z, INTERMEDIATE_DATA_DIR.parent)
75+
76+
77+
def download_dcgan_data() -> None:
    """Download img_align_celeba.zip into DATA_DIR and unzip it into beginner_source/data/celeba."""
    # Download dataset for beginner_source/dcgan_faces_tutorial.py
    z = download_url_to_file("https://s3.amazonaws.com/pytorch-tutorial-assets/img_align_celeba.zip",
                             prefix=DATA_DIR,
                             sha256="46fb89443c578308acf364d7d379fe1b9efb793042c0af734b6112e4fd3a8c74",
                             )
    # The Makefile rule this replaces extracted into beginner_source/data/celeba,
    # so keep the "celeba" subdirectory rather than extracting into data/ itself.
    unzip(z, BEGINNER_DATA_DIR / "celeba")
84+
85+
86+
def download_lenet_mnist() -> None:
    """Fetch lenet_mnist_model.pth straight into BEGINNER_DATA_DIR."""
    # Download model for beginner_source/fgsm_tutorial.py
    model_url = "https://docs.google.com/uc?export=download&id=1HJV2nUHJqclXQ8flKvcWmjZ-OU5DGatl"
    expected_digest = "cb5f8e578aef96d5c1a2cc5695e1aa9bbf4d0fe00d25760eeebaaac6ebc2edcb"
    download_url_to_file(
        model_url,
        dst="lenet_mnist_model.pth",
        prefix=BEGINNER_DATA_DIR,
        sha256=expected_digest,
    )
93+
94+
95+
def main() -> None:
    """Create the data directories and run the tutorial data downloads.

    The hymenoptera and NLP archives are always fetched. The DCGAN dataset and
    the LeNet MNIST model are fetched only when FILES_TO_RUN is unset or
    mentions the tutorial that needs them.
    """
    data_dirs = (
        DATA_DIR,
        BEGINNER_DATA_DIR,
        ADVANCED_DATA_DIR,
        INTERMEDIATE_DATA_DIR,
        PROTOTYPE_DATA_DIR,
    )
    for data_dir in data_dirs:
        # parents=True mirrors the `mkdir -p` calls this script replaces, so a
        # missing intermediate directory does not abort the run.
        data_dir.mkdir(parents=True, exist_ok=True)

    download_hymenoptera_data()
    download_nlp_data()
    # FILES_TO_RUN is a plain string, so these are substring checks.
    if FILES_TO_RUN is None or "dcgan_faces_tutorial" in FILES_TO_RUN:
        download_dcgan_data()
    if FILES_TO_RUN is None or "fgsm_tutorial" in FILES_TO_RUN:
        download_lenet_mnist()


if __name__ == "__main__":
    main()

Makefile

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -38,20 +38,8 @@ download:
3838
# Step2-2. UNTAR: tar -xzf $(DATADIR)/[SOURCE_FILE] -C [*_source/data/]
3939
# Step2-3. AS-IS: cp $(DATADIR)/[SOURCE_FILE] [*_source/data/]
4040

41-
# make data directories
42-
mkdir -p $(DATADIR)
43-
mkdir -p advanced_source/data
44-
mkdir -p beginner_source/data
45-
mkdir -p intermediate_source/data
46-
mkdir -p prototype_source/data
47-
48-
# transfer learning tutorial data
49-
wget -nv -N https://download.pytorch.org/tutorial/hymenoptera_data.zip -P $(DATADIR)
50-
unzip $(ZIPOPTS) $(DATADIR)/hymenoptera_data.zip -d beginner_source/data/
51-
52-
# nlp tutorial data
53-
wget -nv -N https://download.pytorch.org/tutorial/data.zip -P $(DATADIR)
54-
unzip $(ZIPOPTS) $(DATADIR)/data.zip -d intermediate_source/ # This will unzip all files in data.zip to intermediate_source/data/ folder
41+
# Run structured downloads first (will also make directories)
42+
python3 .jenkins/download_data.py
5543

5644
# data loader tutorial
5745
wget -nv -N https://download.pytorch.org/tutorial/faces.zip -P $(DATADIR)
@@ -65,10 +53,6 @@ download:
6553
mkdir -p advanced_source/data/images/
6654
cp -r _static/img/neural-style/ advanced_source/data/images/
6755

68-
# Download dataset for beginner_source/dcgan_faces_tutorial.py
69-
wget -nv -N https://s3.amazonaws.com/pytorch-tutorial-assets/img_align_celeba.zip -P $(DATADIR)
70-
unzip $(ZIPOPTS) $(DATADIR)/img_align_celeba.zip -d beginner_source/data/celeba
71-
7256
# Download dataset for beginner_source/hybrid_frontend/introduction_to_hybrid_frontend_tutorial.py
7357
wget -nv -N https://s3.amazonaws.com/pytorch-tutorial-assets/iris.data -P $(DATADIR)
7458
cp $(DATADIR)/iris.data beginner_source/data/
@@ -81,9 +65,6 @@ download:
8165
wget -nv -N https://s3.amazonaws.com/pytorch-tutorial-assets/UrbanSound8K.tar.gz -P $(DATADIR)
8266
tar $(TAROPTS) -xzf $(DATADIR)/UrbanSound8K.tar.gz -C ./beginner_source/data/
8367

84-
# Download model for beginner_source/fgsm_tutorial.py
85-
wget -nv 'https://docs.google.com/uc?export=download&id=1HJV2nUHJqclXQ8flKvcWmjZ-OU5DGatl' -O $(DATADIR)/lenet_mnist_model.pth
86-
cp $(DATADIR)/lenet_mnist_model.pth ./beginner_source/data/lenet_mnist_model.pth
8768

8869
# Download model for advanced_source/dynamic_quantization_tutorial.py
8970
wget -nv -N https://s3.amazonaws.com/pytorch-tutorial-assets/word_language_model_quantize.pth -P $(DATADIR)

0 commit comments

Comments
 (0)