
Commit 8a9070a

Add TransCheX Tutorial (#488)
* Add TransCheX Tutorial
  Signed-off-by: ahatamizadeh <ahatamizadeh@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  For more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 4e1e316 commit 8a9070a

File tree: 4 files changed (+736, -0 lines)
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
# Preprocessing Open-I Dataset

The Open-I dataset provides a collection of 3,996 radiology reports with 8,121 associated images in PA, AP and lateral views. In this tutorial, we utilize the frontal-view images with their corresponding reports for training and evaluation of the TransCheX model. The chest X-ray images and reports are originally from the Indiana University hospital (see the licensing information below). The 14 finding categories in this work include Atelectasis, Cardiomegaly, Consolidation, Edema, Enlarged-Cardiomediastinum, Fracture, Lung-Lesion, Lung-Opacity, No-Finding, Pleural-Effusion, Pleural-Other, Pneumonia, Pneumothorax and Support-Devices. More information can be found at the following link:

https://openi.nlm.nih.gov/faq

License: Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)

In this section, we provide the steps needed to preprocess the Open-I dataset for the multi-label disease classification tutorial using the TransCheX model. Once the following steps are completed, the dataset can be readily used for the tutorial.
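For reference, the sketch below collects these 14 categories as a plain Python list, in the same order as the label columns that `preprocess_openi.py` writes to the CSV files (note that the script spells 'Pleural_Other' with an underscore, while the other columns use hyphens). This is only a convenience snippet, not part of the tutorial code.

```python
# Convenience sketch: the 14 finding categories, in the same order as the
# label columns written by preprocess_openi.py ('Pleural_Other' is spelled
# with an underscore in the script's CSV output).
FINDING_LABELS = [
    "Atelectasis", "Cardiomegaly", "Consolidation", "Edema",
    "Enlarged-Cardiomediastinum", "Fracture", "Lung-Lesion", "Lung-Opacity",
    "No-Finding", "Pleural-Effusion", "Pleural_Other", "Pneumonia",
    "Pneumothorax", "Support-Devices",
]
```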
### Preprocessing Steps

1) Create a new folder named 'monai_data' for downloading the raw data and preprocessing.
2) Download the chest X-ray images in PNG format from this [link](https://openi.nlm.nih.gov/imgs/collections/NLMCXR_png.tgz). Copy the downloaded file (NLMCXR_png.tgz) to the 'monai_data' directory and extract it.
3) Download the reports in XML format from this [link](https://openi.nlm.nih.gov/imgs/collections/NLMCXR_reports.tgz). Copy the downloaded file (NLMCXR_reports.tgz) to the 'monai_data' directory and extract it (steps 2 and 3 can also be scripted; see the sketch after this list).
4) Download the train, validation and test splits from this [link](https://drive.google.com/u/1/uc?id=1jvT0jVl9mgtWy4cS7LYbF43bQE4mrXAY&export=download). Copy the downloaded file (TransChex_openi.zip) to the 'monai_data' directory and extract it.
5) Run 'preprocess_openi.py' to process the images and reports.
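The snippet below is a minimal sketch for automating steps 2 and 3, not part of the tutorial code. It assumes the two Open-I URLs above are reachable and extracts each archive into './monai_data/dataset_orig/<archive name>', which matches the path constants at the top of `preprocess_openi.py`; verify the resulting folder names against those constants. The Google Drive split file in step 4 is easiest to download manually in a browser.

```python
# Minimal sketch (assumed layout): download the two Open-I archives and extract
# each one under ./monai_data/dataset_orig/, matching the paths used by
# preprocess_openi.py.
import os
import tarfile
import urllib.request

archives = {
    "NLMCXR_png": "https://openi.nlm.nih.gov/imgs/collections/NLMCXR_png.tgz",
    "NLMCXR_reports": "https://openi.nlm.nih.gov/imgs/collections/NLMCXR_reports.tgz",
}

orig_dir = "./monai_data/dataset_orig"
os.makedirs(orig_dir, exist_ok=True)

for name, url in archives.items():
    tgz_path = os.path.join("./monai_data", name + ".tgz")
    if not os.path.isfile(tgz_path):
        print(f"Downloading {name}.tgz ...")
        urllib.request.urlretrieve(url, tgz_path)
    out_dir = os.path.join(orig_dir, name)
    os.makedirs(out_dir, exist_ok=True)
    print(f"Extracting {name}.tgz into {out_dir} ...")
    with tarfile.open(tgz_path, "r:gz") as tar:
        tar.extractall(path=out_dir)
```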
Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
# Copyright 2020 - 2021 MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from os import listdir
from os.path import isfile, join
import numpy as np
from xml.dom import minidom
from PIL import Image
import pandas as pd
import xml.etree.ElementTree as ET


def create_report(img_names_list_, report_list_, gt_list_, save_add):
    # Write one CSV with the image id, report text and the 14 finding labels.
    pd.DataFrame({'id': img_names_list_, 'report': report_list_, 'Atelectasis': gt_list_[:, 0],
                  'Cardiomegaly': gt_list_[:, 1], 'Consolidation': gt_list_[:, 2], 'Edema': gt_list_[:, 3],
                  'Enlarged-Cardiomediastinum': gt_list_[:, 4], 'Fracture': gt_list_[:, 5], 'Lung-Lesion': gt_list_[:, 6],
                  'Lung-Opacity': gt_list_[:, 7], 'No-Finding': gt_list_[:, 8], 'Pleural-Effusion': gt_list_[:, 9],
                  'Pleural_Other': gt_list_[:, 10], 'Pneumonia': gt_list_[:, 11], 'Pneumothorax': gt_list_[:, 12],
                  'Support-Devices': gt_list_[:, 13]}).to_csv(save_add, index=False)


# Input and output locations.
report_file_add = './monai_data/dataset_orig/NLMCXR_reports/ecgen-radiology'
img_file_add = './monai_data/dataset_orig/NLMCXR_png'
img_save_add = './monai_data/dataset_proc/images'
report_train_save_add = './monai_data/dataset_proc/train.csv'
report_val_save_add = './monai_data/dataset_proc/validation.csv'
report_test_save_add = './monai_data/dataset_proc/test.csv'

if not os.path.isdir(img_save_add):
    os.makedirs(img_save_add)
report_files = [f for f in listdir(report_file_add) if isfile(join(report_file_add, f))]

# The split files map image ids to their ground-truth label vectors.
train_data = np.load('./train.npy', allow_pickle=True).item()
train_data_id = train_data['id_GT']
train_data_gt = train_data['label_GT']

val_data = np.load('./validation.npy', allow_pickle=True).item()
val_data_id = val_data['id_GT']
val_data_gt = val_data['label_GT']

test_data = np.load('./test.npy', allow_pickle=True).item()
test_data_id = test_data['id_GT']
test_data_gt = test_data['label_GT']

all_cases = np.union1d(np.union1d(train_data_id, val_data_id), test_data_id)

img_names_list_train = []
img_names_list_val = []
img_names_list_test = []

report_list_train = []
report_list_val = []
report_list_test = []

gt_list_train = []
gt_list_val = []
gt_list_test = []

for file in report_files:
    print('Processing {}'.format(file))
    add_xml = os.path.join(report_file_add, file)
    docs = minidom.parse(add_xml)
    tree = ET.parse(add_xml)
    # Build the report string by concatenating the FINDINGS and IMPRESSION sections.
    for node in tree.iter('AbstractText'):
        i = 0
        for elem in node.iter():
            if elem.attrib['Label'] == "FINDINGS":
                if elem.text is None:
                    report = "FINDINGS : "
                else:
                    report = "FINDINGS : " + elem.text
            elif elem.attrib['Label'] == "IMPRESSION":
                if elem.text is None:
                    report = report + " IMPRESSION : "
                else:
                    report = report + " IMPRESSION : " + elem.text
    # Resize each image referenced by the report (if it belongs to one of the splits)
    # and record its report text and ground-truth labels for the matching split.
    images = docs.getElementsByTagName("parentImage")
    for i in images:
        img_name = i.getAttribute("id") + '.png'
        if img_name in all_cases:
            Image.open(os.path.join(img_file_add, img_name)).resize((512, 512)).save(
                os.path.join(img_save_add, img_name))
            if img_name in train_data_id:
                img_names_list_train.append(img_name)
                report_list_train.append(report)
                gt_list_train.append(train_data_gt[np.where(train_data_id == img_name)[0][0]])
            elif img_name in val_data_id:
                img_names_list_val.append(img_name)
                report_list_val.append(report)
                gt_list_val.append(val_data_gt[np.where(val_data_id == img_name)[0][0]])
            elif img_name in test_data_id:
                img_names_list_test.append(img_name)
                report_list_test.append(report)
                gt_list_test.append(test_data_gt[np.where(test_data_id == img_name)[0][0]])

datasets = [{"save_add": report_train_save_add,
             "img_name": np.array(img_names_list_train),
             "report": np.array(report_list_train),
             "gt": np.array(gt_list_train)},
            {"save_add": report_val_save_add,
             "img_name": np.array(img_names_list_val),
             "report": np.array(report_list_val),
             "gt": np.array(gt_list_val)},
            {"save_add": report_test_save_add,
             "img_name": np.array(img_names_list_test),
             "report": np.array(report_list_test),
             "gt": np.array(gt_list_test)}
            ]
for dataset in datasets:
    create_report(dataset["img_name"], dataset["report"], dataset["gt"], dataset["save_add"])

print('Processed Dataset Files Are Saved !')
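Once the script finishes, a quick way to sanity-check the output is to load one of the generated CSVs. The sketch below is only an illustration and assumes the script was run from the directory containing 'monai_data'.

```python
# Hedged sketch: inspect one of the CSVs produced by preprocess_openi.py.
import pandas as pd

df = pd.read_csv("./monai_data/dataset_proc/validation.csv")
print(df.shape)             # number of image/report pairs and columns
print(df.columns.tolist())  # 'id', 'report' and the 14 finding labels
print(df.head(3))           # first rows: image name, report text, 0/1 labels
```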
