|
| 1 | +# Copyright 2020 - 2021 MONAI Consortium |
| 2 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 3 | +# you may not use this file except in compliance with the License. |
| 4 | +# You may obtain a copy of the License at |
| 5 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 6 | +# Unless required by applicable law or agreed to in writing, software |
| 7 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 8 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 9 | +# See the License for the specific language governing permissions and |
| 10 | +# limitations under the License. |
| 11 | + |
| 12 | +import os |
| 13 | +from os import listdir |
| 14 | +from os.path import isfile, join |
| 15 | +import numpy as np |
| 16 | +from xml.dom import minidom |
| 17 | +from PIL import Image |
| 18 | +import pandas as pd |
| 19 | +import xml.etree.ElementTree as ET |
| 20 | + |
def create_report(img_names_list_, report_list_, gt_list_, save_add):
    """Write one dataset split to a CSV file.

    The CSV has an ``id`` column, a ``report`` column and one column per
    label, taken from columns 0..13 of ``gt_list_`` in that order.

    Args:
        img_names_list_: sequence of image file names, one per row.
        report_list_: sequence of report strings, one per row.
        gt_list_: 2-D array-like of labels with at least 14 columns; row ``r``
            holds the labels of ``img_names_list_[r]``. An empty input (any
            shape with zero elements) is accepted and yields a header-only CSV.
        save_add: destination path of the CSV file.

    Raises:
        IndexError: if ``gt_list_`` is non-empty but is not 2-D with at least
            14 columns.
    """
    # Column order is the on-disk format. 'Pleural_Other' uses an underscore,
    # unlike its neighbours; it is kept verbatim so existing readers of the
    # CSV keep working.
    label_columns = (
        'Atelectasis',
        'Cardiomegaly',
        'Consolidation',
        'Edema',
        'Enlarged-Cardiomediastinum',
        'Fracture',
        'Lung-Lesion',
        'Lung-Opacity',
        'No-Finding',
        'Pleural-Effusion',
        'Pleural_Other',
        'Pneumonia',
        'Pneumothorax',
        'Support-Devices',
    )

    labels = np.asarray(gt_list_)
    if labels.size == 0:
        # np.array([]) is 1-D, so column slicing would raise IndexError for an
        # empty split; give it an explicit (0, n_labels) shape instead.
        labels = labels.reshape(0, len(label_columns))

    columns = {'id': img_names_list_, 'report': report_list_}
    for col_idx, col_name in enumerate(label_columns):
        columns[col_name] = labels[:, col_idx]

    pd.DataFrame(columns).to_csv(save_add, index=False)
| 28 | + |
# Input locations: the XML reports and the PNG images of the original dataset.
report_file_add = './monai_data/dataset_orig/NLMCXR_reports/ecgen-radiology'
img_file_add = './monai_data/dataset_orig/NLMCXR_png'

# Output locations: resized images and one CSV per split.
img_save_add = './monai_data/dataset_proc/images'
report_train_save_add = './monai_data/dataset_proc/train.csv'
report_val_save_add = './monai_data/dataset_proc/validation.csv'
report_test_save_add = './monai_data/dataset_proc/test.csv'

# Create the image output directory if it is not already there.
os.makedirs(img_save_add, exist_ok=True)

# Only regular files inside the report directory are treated as reports.
report_files = [
    entry for entry in listdir(report_file_add)
    if isfile(join(report_file_add, entry))
]


def _load_split(npy_path):
    """Load one split file and return ``(data, ids, labels)``.

    The file is read with ``allow_pickle=True`` and unwrapped with ``.item()``;
    the result is indexed with the keys ``'id_GT'`` and ``'label_GT'``.
    NOTE(review): unpickling executes arbitrary code, so only load split files
    from a trusted source.
    """
    split = np.load(npy_path, allow_pickle=True).item()
    return split, split['id_GT'], split['label_GT']


train_data, train_data_id, train_data_gt = _load_split('./train.npy')
val_data, val_data_id, val_data_gt = _load_split('./validation.npy')
test_data, test_data_id, test_data_gt = _load_split('./test.npy')

# Every image id that belongs to any of the three splits.
train_and_val_ids = np.union1d(train_data_id, val_data_id)
all_cases = np.union1d(train_and_val_ids, test_data_id)

# Per-split accumulators filled while the reports are processed.
img_names_list_train, img_names_list_val, img_names_list_test = [], [], []
report_list_train, report_list_val, report_list_test = [], [], []
gt_list_train, gt_list_val, gt_list_test = [], [], []
| 65 | + |
# For every report file: build the report text from its FINDINGS and
# IMPRESSION sections, then, for each image the report references that belongs
# to a known split, save a 512x512 copy and record (name, report, labels)
# in that split's accumulators.

# One row per split: (ids, labels, names_out, reports_out, labels_out).
# Order matters: an image is assigned to the first split that contains it.
split_targets = (
    (train_data_id, train_data_gt, img_names_list_train, report_list_train, gt_list_train),
    (val_data_id, val_data_gt, img_names_list_val, report_list_val, gt_list_val),
    (test_data_id, test_data_gt, img_names_list_test, report_list_test, gt_list_test),
)

for file in report_files:
    print('Processing {}'.format(file))
    add_xml = os.path.join(report_file_add, file)
    # A single ElementTree parse serves both the text and the image lookup.
    tree = ET.parse(add_xml)

    # Reset per file so a report without a FINDINGS section can neither reuse
    # the previous file's text nor hit an undefined name on the first file.
    report = ""
    for node in tree.iter('AbstractText'):
        for elem in node.iter():
            # .get() tolerates elements that carry no Label attribute.
            section = elem.attrib.get('Label')
            text = elem.text if elem.text is not None else ""
            if section == "FINDINGS":
                report = "FINDINGS : " + text
            elif section == "IMPRESSION":
                report = report + " IMPRESSION : " + text

    for image in tree.iter('parentImage'):
        img_name = image.get('id', '') + '.png'
        if img_name not in all_cases:
            continue

        # The context manager closes the source file once the copy is saved.
        with Image.open(os.path.join(img_file_add, img_name)) as src_img:
            src_img.resize((512, 512)).save(os.path.join(img_save_add, img_name))

        for ids, labels, names_out, reports_out, labels_out in split_targets:
            # One scan gives both membership and the row of the first match.
            matches = np.where(ids == img_name)[0]
            if matches.size:
                names_out.append(img_name)
                reports_out.append(report)
                labels_out.append(labels[matches[0]])
                break
| 102 | + |
# Pair each output CSV path with the accumulators collected for that split.
split_outputs = (
    (report_train_save_add, img_names_list_train, report_list_train, gt_list_train),
    (report_val_save_add, img_names_list_val, report_list_val, gt_list_val),
    (report_test_save_add, img_names_list_test, report_list_test, gt_list_test),
)

# Convert every accumulator to an array up front, before anything is written.
datasets = [
    {
        "save_add": csv_path,
        "img_name": np.array(names),
        "report": np.array(reports),
        "gt": np.array(labels),
    }
    for csv_path, names, reports, labels in split_outputs
]

# Write the train, validation and test CSV files, in that order.
for dataset in datasets:
    create_report(
        img_names_list_=dataset["img_name"],
        report_list_=dataset["report"],
        gt_list_=dataset["gt"],
        save_add=dataset["save_add"],
    )

print('Processed Dataset Files Are Saved !')
0 commit comments