From 4b3f926d2382720466b00c2cb060469a6886bf0f Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Tue, 29 Sep 2015 14:15:15 -0400 Subject: [PATCH 01/45] add resource multiproc plugin --- nipype/interfaces/base.py | 2 + nipype/pipeline/plugins/__init__.py | 3 + nipype/pipeline/plugins/callback_log.py | 20 +++ nipype/pipeline/plugins/multiproc.py | 163 ++++++++++++++++++++++++ 4 files changed, 188 insertions(+) create mode 100644 nipype/pipeline/plugins/callback_log.py diff --git a/nipype/interfaces/base.py b/nipype/interfaces/base.py index ac6b7b8af4..854fb44fe1 100644 --- a/nipype/interfaces/base.py +++ b/nipype/interfaces/base.py @@ -750,6 +750,8 @@ def __init__(self, **inputs): raise Exception('No input_spec in class: %s' % self.__class__.__name__) self.inputs = self.input_spec(**inputs) + self.memory = 1 + self.num_threads = 1 @classmethod def help(cls, returnhelp=False): diff --git a/nipype/pipeline/plugins/__init__.py b/nipype/pipeline/plugins/__init__.py index dac14301b2..cf392f0f77 100644 --- a/nipype/pipeline/plugins/__init__.py +++ b/nipype/pipeline/plugins/__init__.py @@ -9,6 +9,7 @@ from .condor import CondorPlugin from .dagman import CondorDAGManPlugin from .multiproc import MultiProcPlugin +from .multiproc import ResourceMultiProcPlugin from .ipython import IPythonPlugin from .somaflow import SomaFlowPlugin from .pbsgraph import PBSGraphPlugin @@ -16,3 +17,5 @@ from .lsf import LSFPlugin from .slurm import SLURMPlugin from .slurmgraph import SLURMGraphPlugin + +from .callback_log import log_nodes_cb diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py new file mode 100644 index 0000000000..cd8827bf29 --- /dev/null +++ b/nipype/pipeline/plugins/callback_log.py @@ -0,0 +1,20 @@ +import logging +import datetime + +def log_nodes_cb(node, status): + if status == 'start': + print 'START', "name:",node.name, "id:", node._id, "start:", datetime.datetime.now(), "memory:", node._interface.memory,"num_threads:", node._interface.num_threads + logging.debug( + "name:",node.name, + "id:", node._id, + "start:", datetime.datetime.now(), + "memory:", node._interface.memory, + "num_threads:", node._interface.num_threads) + else: + print 'FINISH', "name:",node.name, "id:", node._id, "finish:", datetime.datetime.now(), "memory:", node._interface.memory,"num_threads:", node._interface.num_threads + logging.debug( + "name:",node.name, + "id:", node._id, + "finish:", datetime.datetime.now(), + "memory:", node._interface.memory, + "num_threads:", node._interface.num_threads) \ No newline at end of file diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index 0f6b11c30a..51de405542 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -96,3 +96,166 @@ def _report_crash(self, node, result=None): def _clear_task(self, taskid): del self._taskresult[taskid] + + + + + +import numpy as np +from copy import deepcopy +from ..engine import (MapNode, str2bool) +import datetime +import psutil + +class ResourceMultiProcPlugin(MultiProcPlugin): + + def __init__(self, plugin_args=None): + super(ResourceMultiProcPlugin, self).__init__(plugin_args=plugin_args) + self.plugin_args = plugin_args + self.current_time = datetime.datetime.now() + self.log_nodes = [] + + def _send_procs_to_workers(self, updatehash=False, graph=None): + executing_now = [] + processors = cpu_count() + memory = psutil.virtual_memory() + memory = memory.total + if self.plugin_args: + if 'n_procs' in self.plugin_args: + 
processors = self.plugin_args['n_procs'] + if 'memory' in self.plugin_args: + memory = self.plugin_args['memory'] + + + jobids = np.flatnonzero((self.proc_pending == True) & (self.depidx.sum(axis=0) == 0).__array__()) + print('START, pending_tasks:', jobids) + + #busy_processors = number of busy processors + busy_memory = 0 + busy_processors = 0 + for jobid in jobids: + print 'using memory:', jobid, self.procs[jobid]._interface.num_threads + busy_memory+= self.procs[jobid]._interface.memory + busy_processors+= self.procs[jobid]._interface.num_threads + + + free_memory = memory - busy_memory + free_processors = processors - busy_processors + + #jobids = all jobs without dependency not run + jobids = np.flatnonzero((self.proc_done == False) & (self.depidx.sum(axis=0) == 0).__array__()) + + + #sort jobids first by memory and then by number of threads + jobids = sorted(jobids, key=lambda item: (self.procs[item]._interface.memory, self.procs[item]._interface.num_threads)) + print('jobids ->', jobids) + + print 'free memory ->', free_memory, ', free processors ->', free_processors + + + #while have enough memory and processors for first job + #submit first job on the list + for jobid in jobids: + print 'next_job ->', jobid, 'memory:', self.procs[jobid]._interface.memory, 'threads:', self.procs[jobid]._interface.num_threads + + print 'can job execute?', self.procs[jobid]._interface.memory <= free_memory and self.procs[jobid]._interface.num_threads <= free_processors + if self.procs[jobid]._interface.memory <= free_memory and self.procs[jobid]._interface.num_threads <= free_processors: + print('Executing: %s ID: %d' %(self.procs[jobid]._id, jobid)) + executing_now.append(self.procs[jobid]) + + if isinstance(self.procs[jobid], MapNode): + try: + num_subnodes = self.procs[jobid].num_subnodes() + except Exception: + self._clean_queue(jobid, graph) + self.proc_pending[jobid] = False + continue + if num_subnodes > 1: + submit = self._submit_mapnode(jobid) + if not submit: + continue + + + self.proc_done[jobid] = True + self.proc_pending[jobid] = True + + free_memory -= self.procs[jobid]._interface.memory + free_processors -= self.procs[jobid]._interface.num_threads + + if self._status_callback: + self._status_callback(self.procs[jobid], 'start') + + + + if str2bool(self.procs[jobid].config['execution']['local_hash_check']): + print('checking hash locally') + try: + hash_exists, _, _, _ = self.procs[ + jobid].hash_exists() + print('Hash exists %s' % str(hash_exists)) + if (hash_exists and (self.procs[jobid].overwrite == False or (self.procs[jobid].overwrite == None and not self.procs[jobid]._interface.always_run))): + self._task_finished_cb(jobid) + self._remove_node_dirs() + continue + except Exception: + self._clean_queue(jobid, graph) + self.proc_pending[jobid] = False + continue + + + print('Finished checking hash') + + + if self.procs[jobid].run_without_submitting: + print('Running node %s on master thread' %self.procs[jobid]) + try: + self.procs[jobid].run() + except Exception: + self._clean_queue(jobid, graph) + self._task_finished_cb(jobid) + self._remove_node_dirs() + + else: + print('submitting', jobid) + tid = self._submit_job(deepcopy(self.procs[jobid]), updatehash=updatehash) + if tid is None: + self.proc_done[jobid] = False + self.proc_pending[jobid] = False + else: + self.pending_tasks.insert(0, (tid, jobid)) + else: + break + + #run this code when not running each node + # max_node = datetime.datetime.min + # for n in executing_now: + # name = n.name + # start = self.current_time + # 
finish = self.current_time + n._interface.time + # duration = (finish - start).total_seconds() + # memory = n._interface.memory + # num_threads = n._interface.num_threads + + # if finish > max_node: + # max_node = finish + + # self.log_nodes.append({'name': name, 'start': str(start), 'finish': str(finish), 'duration': duration, 'memory':memory, 'num_threads': num_threads}) + + + # if len(executing_now) > 0: + # self.current_time = finish + # #write log + # self.log_nodes = sorted(self.log_nodes, key=lambda n: datetime.datetime.strptime(n['start'],"%Y-%m-%d %H:%M:%S.%f")) + # first_node = datetime.datetime.strptime(self.log_nodes[0]['start'],"%Y-%m-%d %H:%M:%S.%f") + # last_node = datetime.datetime.strptime(self.log_nodes[-1]['finish'],"%Y-%m-%d %H:%M:%S.%f") + + + # result = {"name": os.getcwd(), "start": str(first_node), "finish": str(last_node), "duration": (last_node - first_node).total_seconds() / 60, "nodes": self.log_nodes} + + # log_content = json.dumps(result) + # log_file = open('log_anat_preproc.py', 'wb') + # log_file.write(log_content) + # log_file.close() + + print('- - - - - - - - - - - - - - - ', len(self.log_nodes), '- - - - - - - - - - - - - - - ') + print('No jobs waiting to execute') From 6f4690bde42bb219b1e59369cf93b2904fd8b7df Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Tue, 29 Sep 2015 17:19:35 -0400 Subject: [PATCH 02/45] callback functions write log --- nipype/pipeline/plugins/callback_log.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py index cd8827bf29..1faec2b4c1 100644 --- a/nipype/pipeline/plugins/callback_log.py +++ b/nipype/pipeline/plugins/callback_log.py @@ -1,20 +1,11 @@ -import logging import datetime +import logging def log_nodes_cb(node, status): + logger = logging.getLogger('callback') if status == 'start': - print 'START', "name:",node.name, "id:", node._id, "start:", datetime.datetime.now(), "memory:", node._interface.memory,"num_threads:", node._interface.num_threads - logging.debug( - "name:",node.name, - "id:", node._id, - "start:", datetime.datetime.now(), - "memory:", node._interface.memory, - "num_threads:", node._interface.num_threads) + message = "name:",node.name, "id:", node._id, "start:", datetime.datetime.now(), "memory:", node._interface.memory, "num_threads:", node._interface.num_threads + logger.debug(message) else: - print 'FINISH', "name:",node.name, "id:", node._id, "finish:", datetime.datetime.now(), "memory:", node._interface.memory,"num_threads:", node._interface.num_threads - logging.debug( - "name:",node.name, - "id:", node._id, - "finish:", datetime.datetime.now(), - "memory:", node._interface.memory, - "num_threads:", node._interface.num_threads) \ No newline at end of file + message = "name:",node.name, "id:", node._id, "finish:", datetime.datetime.now(), "memory:", node._interface.memory, "num_threads:", node._interface.num_threads + logger.debug(message) \ No newline at end of file From 52da583c02d60e5272b036e7a64a9919300f8e50 Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Wed, 30 Sep 2015 14:00:22 -0400 Subject: [PATCH 03/45] fix multiproc tests. 
create lot 2 json converter --- nipype/pipeline/plugins/callback_log.py | 45 ++++- .../pipeline/plugins/tests/test_multiproc.py | 162 +++++++++++++++++- 2 files changed, 202 insertions(+), 5 deletions(-) diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py index 1faec2b4c1..52e2621f60 100644 --- a/nipype/pipeline/plugins/callback_log.py +++ b/nipype/pipeline/plugins/callback_log.py @@ -4,8 +4,47 @@ def log_nodes_cb(node, status): logger = logging.getLogger('callback') if status == 'start': - message = "name:",node.name, "id:", node._id, "start:", datetime.datetime.now(), "memory:", node._interface.memory, "num_threads:", node._interface.num_threads + message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + node._id + '"' + ',"start":' + '"' +str(datetime.datetime.now()) + '"' + ',"memory":' + str(node._interface.memory) + ',"num_threads":' + str(node._interface.num_threads) + '}' logger.debug(message) else: - message = "name:",node.name, "id:", node._id, "finish:", datetime.datetime.now(), "memory:", node._interface.memory, "num_threads:", node._interface.num_threads - logger.debug(message) \ No newline at end of file + message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) + '"' + ',"memory":' + str(node._interface.memory) + ',"num_threads":' + str(node._interface.num_threads) + '}' + logger.debug(message) + + + +import json +from dateutil import parser + +def convert_logcb_to_json(filename): + with open(filename, 'r') as content: + #read file separating each line + content = content.read() + lines = content.split('\n') + + #separate lines of starting nodes and ending nodes + starts = [ json.loads(x) for x in lines if '"start":' in x ] + ends = [json.loads(x) for x in lines if '"finish":' in x ] + + + + #foreach start, search its end. They have same name and id + #this line is O(n^2). 
refactor + for element in starts: + end = next((f for f in ends if (f['id'] == element['id'] and f['name'] == element['name'])), None) + + if end is not None: + element['finish'] = end['finish'] + else: + element['finish'] = element['start'] + + + first_node = starts[0]['start'] + last_node = ends[-1]['finish'] + + duration = parser.parse(last_node) - parser.parse(first_node) + + #sorted(starts, key=lambda e: parser.parse(e['start'])) # sort by age + result = {'start': first_node, 'finish': last_node, 'duration':duration.total_seconds(), 'nodes': starts} + #finally, save the json file + with open(filename + '.json', 'w') as outfile: + json.dump(result, outfile) \ No newline at end of file diff --git a/nipype/pipeline/plugins/tests/test_multiproc.py b/nipype/pipeline/plugins/tests/test_multiproc.py index 66f755da9a..e0fdce7255 100644 --- a/nipype/pipeline/plugins/tests/test_multiproc.py +++ b/nipype/pipeline/plugins/tests/test_multiproc.py @@ -3,7 +3,7 @@ from tempfile import mkdtemp from shutil import rmtree -from nipype.testing import assert_equal +from nipype.testing import assert_equal, assert_less_equal import nipype.pipeline.engine as pe class InputSpec(nib.TraitedSpec): @@ -46,4 +46,162 @@ def test_run_multiproc(): result = node.get_output('output1') yield assert_equal, result, [1, 1] os.chdir(cur_dir) - rmtree(temp_dir) \ No newline at end of file + rmtree(temp_dir) + + + +################################# + + +class InputSpecSingleNode(nib.TraitedSpec): + input1 = nib.traits.Int(desc='a random int') + input2 = nib.traits.Int(desc='a random int') + +class OutputSpecSingleNode(nib.TraitedSpec): + output1 = nib.traits.Int(desc='a random int') + + +class TestInterfaceSingleNode(nib.BaseInterface): + input_spec = InputSpecSingleNode + output_spec = OutputSpecSingleNode + + def _run_interface(self, runtime): + runtime.returncode = 0 + return runtime + + def _list_outputs(self): + outputs = self._outputs().get() + outputs['output1'] = self.inputs.input1 + return outputs + + +def parse_log(filename, measure): + import json + from dateutil.parser import parse + from datetime import datetime + import datetime as d + + json_data = open(filename).read() + data = json.loads(json_data) + total_duration = int(float(data['duration']) * 60) #total duration in seconds + + total = [] + for i in range(total_duration): + total.append(0) + + now = parse(data['start']) + for i in range(total_duration): + start_index = 0 + node_start = None + node_finish = None + + x = now + + for j in range(start_index, len(data['nodes'])): + node_start = parse(data['nodes'][j]['start']) + node_finish = parse(data['nodes'][j]['finish']) + + if node_start < x and node_finish > x: + total[i] += data['nodes'][j][measure] + start_index = j + + if node_start > x: + break + + now += d.timedelta(seconds=1) + + return total + + +import os +from nipype.pipeline.plugins.callback_log import log_nodes_cb, convert_logcb_to_json +import logging +import logging.handlers +def test_do_not_use_more_memory_then_specified(): + LOG_FILENAME = 'callback.log' + my_logger = logging.getLogger('callback') + my_logger.setLevel(logging.DEBUG) + + # Add the log message handler to the logger + handler = logging.FileHandler(LOG_FILENAME) + my_logger.addHandler(handler) + + max_memory = 10 + pipe = pe.Workflow(name='pipe') + n1 = pe.Node(interface=TestInterfaceSingleNode(), name='n1') + n2 = pe.Node(interface=TestInterfaceSingleNode(), name='n2') + n3 = pe.Node(interface=TestInterfaceSingleNode(), name='n3') + n4 = 
pe.Node(interface=TestInterfaceSingleNode(), name='n4') + + n1.interface.memory = 1 + n2.interface.memory = 1 + n3.interface.memory = 10 + n4.interface.memory = 1 + + pipe.connect(n1, 'output1', n2, 'input1') + pipe.connect(n1, 'output1', n3, 'input1') + pipe.connect(n2, 'output1', n4, 'input1') + pipe.connect(n3, 'output1', n4, 'input2') + n1.inputs.input1 = 10 + pipe.config['execution']['poll_sleep_duration'] = 1 + pipe.run(plugin='ResourceMultiProc', plugin_args={'memory': max_memory, 'status_callback': log_nodes_cb}) + + convert_logcb_to_json(LOG_FILENAME) + #memory usage in every second + memory = parse_log(LOG_FILENAME + '.json' , 'memory') + + result = True + for m in memory: + if m > max_memory: + result = False + break + + yield assert_equal, result, True + + os.remove(LOG_FILENAME) + os.remove(LOG_FILENAME + '.json') + + +def test_do_not_use_more_threads_then_specified(): + LOG_FILENAME = 'callback.log' + my_logger = logging.getLogger('callback') + my_logger.setLevel(logging.DEBUG) + + # Add the log message handler to the logger + handler = logging.FileHandler(LOG_FILENAME) + my_logger.addHandler(handler) + + max_threads = 10 + pipe = pe.Workflow(name='pipe') + n1 = pe.Node(interface=TestInterfaceSingleNode(), name='n1') + n2 = pe.Node(interface=TestInterfaceSingleNode(), name='n2') + n3 = pe.Node(interface=TestInterfaceSingleNode(), name='n3') + n4 = pe.Node(interface=TestInterfaceSingleNode(), name='n4') + + n1.interface.num_threads = 1 + n2.interface.num_threads = 1 + n3.interface.num_threads = 10 + n4.interface.num_threads = 1 + + pipe.connect(n1, 'output1', n2, 'input1') + pipe.connect(n1, 'output1', n3, 'input1') + pipe.connect(n2, 'output1', n4, 'input1') + pipe.connect(n3, 'output1', n4, 'input2') + n1.inputs.input1 = 10 + pipe.config['execution']['poll_sleep_duration'] = 1 + pipe.run(plugin='ResourceMultiProc', plugin_args={'n_procs': max_threads, 'status_callback': log_nodes_cb}) + + convert_logcb_to_json(LOG_FILENAME) + #memory usage in every second + threads = parse_log(LOG_FILENAME + '.json' , 'num_threads') + + result = True + for t in threads: + if t > max_threads: + result = False + break + + yield assert_equal, result, True + + os.remove(LOG_FILENAME) + os.remove(LOG_FILENAME + '.json') \ No newline at end of file From ffb4756e365e318f299f46c8d5e5747d34790c49 Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Wed, 30 Sep 2015 14:43:24 -0400 Subject: [PATCH 04/45] fix comments and logs --- nipype/pipeline/plugins/multiproc.py | 76 +++++-------------- .../pipeline/plugins/tests/test_multiproc.py | 4 +- 2 files changed, 23 insertions(+), 57 deletions(-) diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index 51de405542..88c843ddab 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -106,6 +106,8 @@ def _clear_task(self, taskid): from ..engine import (MapNode, str2bool) import datetime import psutil +from ... import logging +logger = logging.getLogger('workflow') class ResourceMultiProcPlugin(MultiProcPlugin): @@ -116,6 +118,9 @@ def __init__(self, plugin_args=None): self.log_nodes = [] def _send_procs_to_workers(self, updatehash=False, graph=None): + """ Sends jobs to workers when system resources are available. + Check memory and cores usage before running jobs. 
+ """ executing_now = [] processors = cpu_count() memory = psutil.virtual_memory() @@ -126,41 +131,38 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): if 'memory' in self.plugin_args: memory = self.plugin_args['memory'] - + # Check to see if a job is available jobids = np.flatnonzero((self.proc_pending == True) & (self.depidx.sum(axis=0) == 0).__array__()) - print('START, pending_tasks:', jobids) - #busy_processors = number of busy processors + #check available system resources by summing all threads and memory used busy_memory = 0 busy_processors = 0 for jobid in jobids: - print 'using memory:', jobid, self.procs[jobid]._interface.num_threads busy_memory+= self.procs[jobid]._interface.memory busy_processors+= self.procs[jobid]._interface.num_threads - free_memory = memory - busy_memory free_processors = processors - busy_processors - #jobids = all jobs without dependency not run + + #check all jobs without dependency not run jobids = np.flatnonzero((self.proc_done == False) & (self.depidx.sum(axis=0) == 0).__array__()) - #sort jobids first by memory and then by number of threads + #sort jobs ready to run first by memory and then by number of threads + #The most resource consuming jobs run first jobids = sorted(jobids, key=lambda item: (self.procs[item]._interface.memory, self.procs[item]._interface.num_threads)) - print('jobids ->', jobids) - print 'free memory ->', free_memory, ', free processors ->', free_processors + logger.debug('Free memory: %d, Free processors: %d', free_memory, free_processors) #while have enough memory and processors for first job #submit first job on the list for jobid in jobids: - print 'next_job ->', jobid, 'memory:', self.procs[jobid]._interface.memory, 'threads:', self.procs[jobid]._interface.num_threads + logger.debug('Next Job: %d, memory: %d, threads: %d' %(jobid, self.procs[jobid]._interface.memory, self.procs[jobid]._interface.num_threads)) - print 'can job execute?', self.procs[jobid]._interface.memory <= free_memory and self.procs[jobid]._interface.num_threads <= free_processors if self.procs[jobid]._interface.memory <= free_memory and self.procs[jobid]._interface.num_threads <= free_processors: - print('Executing: %s ID: %d' %(self.procs[jobid]._id, jobid)) + logger.info('Executing: %s ID: %d' %(self.procs[jobid]._id, jobid)) executing_now.append(self.procs[jobid]) if isinstance(self.procs[jobid], MapNode): @@ -175,24 +177,23 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): if not submit: continue - + # change job status in appropriate queues self.proc_done[jobid] = True self.proc_pending[jobid] = True free_memory -= self.procs[jobid]._interface.memory free_processors -= self.procs[jobid]._interface.num_threads + # Send job to task manager and add to pending tasks if self._status_callback: self._status_callback(self.procs[jobid], 'start') - - if str2bool(self.procs[jobid].config['execution']['local_hash_check']): - print('checking hash locally') + logger.debug('checking hash locally') try: hash_exists, _, _, _ = self.procs[ jobid].hash_exists() - print('Hash exists %s' % str(hash_exists)) + logger.debug('Hash exists %s' % str(hash_exists)) if (hash_exists and (self.procs[jobid].overwrite == False or (self.procs[jobid].overwrite == None and not self.procs[jobid]._interface.always_run))): self._task_finished_cb(jobid) self._remove_node_dirs() @@ -201,13 +202,10 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): self._clean_queue(jobid, graph) self.proc_pending[jobid] = False continue - - - 
print('Finished checking hash') - + logger.debug('Finished checking hash') if self.procs[jobid].run_without_submitting: - print('Running node %s on master thread' %self.procs[jobid]) + logger.debug('Running node %s on master thread' %self.procs[jobid]) try: self.procs[jobid].run() except Exception: @@ -226,36 +224,4 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): else: break - #run this code when not running each node - # max_node = datetime.datetime.min - # for n in executing_now: - # name = n.name - # start = self.current_time - # finish = self.current_time + n._interface.time - # duration = (finish - start).total_seconds() - # memory = n._interface.memory - # num_threads = n._interface.num_threads - - # if finish > max_node: - # max_node = finish - - # self.log_nodes.append({'name': name, 'start': str(start), 'finish': str(finish), 'duration': duration, 'memory':memory, 'num_threads': num_threads}) - - - # if len(executing_now) > 0: - # self.current_time = finish - # #write log - # self.log_nodes = sorted(self.log_nodes, key=lambda n: datetime.datetime.strptime(n['start'],"%Y-%m-%d %H:%M:%S.%f")) - # first_node = datetime.datetime.strptime(self.log_nodes[0]['start'],"%Y-%m-%d %H:%M:%S.%f") - # last_node = datetime.datetime.strptime(self.log_nodes[-1]['finish'],"%Y-%m-%d %H:%M:%S.%f") - - - # result = {"name": os.getcwd(), "start": str(first_node), "finish": str(last_node), "duration": (last_node - first_node).total_seconds() / 60, "nodes": self.log_nodes} - - # log_content = json.dumps(result) - # log_file = open('log_anat_preproc.py', 'wb') - # log_file.write(log_content) - # log_file.close() - - print('- - - - - - - - - - - - - - - ', len(self.log_nodes), '- - - - - - - - - - - - - - - ') - print('No jobs waiting to execute') + logger.debug('No jobs waiting to execute') \ No newline at end of file diff --git a/nipype/pipeline/plugins/tests/test_multiproc.py b/nipype/pipeline/plugins/tests/test_multiproc.py index e0fdce7255..dde99d76a0 100644 --- a/nipype/pipeline/plugins/tests/test_multiproc.py +++ b/nipype/pipeline/plugins/tests/test_multiproc.py @@ -83,7 +83,7 @@ def parse_log(filename, measure): json_data = open(filename).read() data = json.loads(json_data) - total_duration = int(float(data['duration']) * 60) #total duration in seconds + total_duration = int(float(data['duration'])) #total duration in seconds total = [] for i in range(total_duration): @@ -192,7 +192,7 @@ def test_do_not_use_more_threads_then_specified(): pipe.run(plugin='ResourceMultiProc', plugin_args={'n_procs': max_threads, 'status_callback': log_nodes_cb}) convert_logcb_to_json(LOG_FILENAME) - #memory usage in every second + #threads usage in every second threads = parse_log(LOG_FILENAME + '.json' , 'num_threads') result = True From 0890e8163032a9fa4d2b4c28a5e0f9dfbbfa6c67 Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Thu, 1 Oct 2015 14:08:43 -0400 Subject: [PATCH 05/45] fix tests --- .../pipeline/plugins/tests/test_multiproc.py | 37 +++++++++++++++---- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/nipype/pipeline/plugins/tests/test_multiproc.py b/nipype/pipeline/plugins/tests/test_multiproc.py index dde99d76a0..645d3bc567 100644 --- a/nipype/pipeline/plugins/tests/test_multiproc.py +++ b/nipype/pipeline/plugins/tests/test_multiproc.py @@ -85,9 +85,11 @@ def parse_log(filename, measure): data = json.loads(json_data) total_duration = int(float(data['duration'])) #total duration in seconds - total = [] + total_memory = [] + total_threads = [] for i in range(total_duration): 
- total.append(0) + total_memory.append(0) + total_threads.append(0) now = parse(data['start']) for i in range(total_duration): @@ -102,7 +104,8 @@ def parse_log(filename, measure): node_finish = parse(data['nodes'][j]['finish']) if node_start < x and node_finish > x: - total[i] += data['nodes'][j][measure] + total_memory[i] += data['nodes'][j]['memory'] + total_threads[i] += data['nodes'][j]['num_threads'] start_index = j if node_start > x: @@ -110,13 +113,15 @@ def parse_log(filename, measure): now += d.timedelta(seconds=1) - return total + return total_memory, total_threads import os from nipype.pipeline.plugins.callback_log import log_nodes_cb, convert_logcb_to_json import logging import logging.handlers +import psutil +from multiprocessing import cpu_count def test_do_not_use_more_memory_then_specified(): LOG_FILENAME = 'callback.log' my_logger = logging.getLogger('callback') @@ -148,7 +153,7 @@ def test_do_not_use_more_memory_then_specified(): convert_logcb_to_json(LOG_FILENAME) #memory usage in every second - memory = parse_log(LOG_FILENAME + '.json' , 'memory') + memory, threads = parse_log(LOG_FILENAME + '.json' , 'memory') result = True for m in memory: @@ -158,6 +163,16 @@ def test_do_not_use_more_memory_then_specified(): yield assert_equal, result, True + max_threads = cpu_count() + + result = True + for t in threads: + if t > max_threads: + result = False + break + + yield assert_equal, result, True, "using more threads than system has (threads is not specified by user)" + os.remove(LOG_FILENAME) os.remove(LOG_FILENAME + '.json') @@ -193,7 +208,7 @@ def test_do_not_use_more_threads_then_specified(): convert_logcb_to_json(LOG_FILENAME) #threads usage in every second - threads = parse_log(LOG_FILENAME + '.json' , 'num_threads') + memory, threads = parse_log(LOG_FILENAME + '.json' , 'num_threads') result = True for t in threads: @@ -201,7 +216,15 @@ def test_do_not_use_more_threads_then_specified(): result = False break - yield assert_equal, result, True + yield assert_equal, result, True, "using more threads than specified" + + max_memory = psutil.virtual_memory().total / (1024*1024) + result = True + for m in memory: + if m > max_memory: + result = False + break + yield assert_equal, result, True, "using more memory than system has (memory is not specified by user)" os.remove(LOG_FILENAME) os.remove(LOG_FILENAME + '.json') \ No newline at end of file From b3c6afc83b1d5fcb68d7a2b45ee24b71d6b40548 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Tue, 6 Oct 2015 15:24:18 -0400 Subject: [PATCH 06/45] Modified the DataSink class and DataSinkInputSpec class to be able to handle uploading data to S3 by including "s3://bucket_name/.." 
in the base_directory, passes all unittests in https://github.com/FCP-INDI/C-PAC/blob/test_dev/test/unit/nipype/datasink_test.py --- nipype/interfaces/io.py | 398 +++++++++++++++++++++++++++++++++++----- 1 file changed, 352 insertions(+), 46 deletions(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index 39ae774c21..b35765c62e 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -131,31 +131,85 @@ def _add_output_traits(self, base): return base +# Class to track percentage of S3 file upload +class ProgressPercentage(object): + ''' + Call-able class instsance (via __call__ method) that displays + upload percentage of a file to S3 + ''' + + def __init__(self, filename): + ''' + ''' + + # Import packages + import threading + import os + + # Initialize data attributes + self._filename = filename + self._size = float(os.path.getsize(filename)) + self._seen_so_far = 0 + self._lock = threading.Lock() + + def __call__(self, bytes_amount): + ''' + ''' + + # Import packages + import sys + + # With the lock on, print upload status + with self._lock: + self._seen_so_far += bytes_amount + percentage = (self._seen_so_far / self._size) * 100 + progress_str = '%d / %d (%.2f%%)\r'\ + % (self._seen_so_far, self._size, percentage) + + # Write to stdout + sys.stdout.write(progress_str) + sys.stdout.flush() + + +# DataSink inputs class DataSinkInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec): + ''' + ''' + + # Init inputspec data attributes base_directory = Directory( desc='Path to the base directory for storing data.') container = traits.Str( desc='Folder within base directory in which to store output') parameterization = traits.Bool(True, usedefault=True, desc='store output in parametrized structure') - strip_dir = Directory(desc='path to strip out of filename') + strip_dir = traits.Directory(desc='path to strip out of filename') substitutions = InputMultiPath(traits.Tuple(traits.Str, traits.Str), desc=('List of 2-tuples reflecting string ' 'to substitute and string to replace ' 'it with')) - regexp_substitutions = InputMultiPath(traits.Tuple(traits.Str, traits.Str), - desc=('List of 2-tuples reflecting a pair ' - 'of a Python regexp pattern and a ' - 'replacement string. Invoked after ' - 'string `substitutions`')) + regexp_substitutions = \ + InputMultiPath(traits.Tuple(traits.Str, traits.Str), + desc=('List of 2-tuples reflecting a pair of a '\ + 'Python regexp pattern and a replacement '\ + 'string. 
Invoked after string `substitutions`')) _outputs = traits.Dict(traits.Str, value={}, usedefault=True) remove_dest_dir = traits.Bool(False, usedefault=True, desc='remove dest directory when copying dirs') + # AWS S3 data attributes + creds_path = traits.Str(desc='Filepath to AWS credentials file for S3 bucket '\ + 'access') + encrypt_bucket_keys = traits.Bool(desc='Flag indicating whether to use S3 '\ + 'server-side AES-256 encryption') + + # Set call-able inputs attributes def __setattr__(self, key, value): + import nipype.interfaces.traits_extension as nit + if key not in self.copyable_trait_names(): - if not isdefined(value): + if not nit.isdefined(value): super(DataSinkInputSpec, self).__setattr__(key, value) self._outputs[key] = value else: @@ -164,11 +218,19 @@ def __setattr__(self, key, value): super(DataSinkInputSpec, self).__setattr__(key, value) +# DataSink outputs class DataSinkOutputSpec(TraitedSpec): + ''' + ''' + + # Import packages + import traits.api as tapi - out_file = traits.Any(desc='datasink output') + # Init out file + out_file = tapi.Any(desc='datasink output') +# Custom DataSink class class DataSink(IOBase): """ Generic datasink module to store structured outputs @@ -230,9 +292,12 @@ class DataSink(IOBase): >>> ds.run() # doctest: +SKIP """ + + # Give obj .inputs and .outputs input_spec = DataSinkInputSpec output_spec = DataSinkOutputSpec + # Initialization method to set up datasink def __init__(self, infields=None, force_run=True, **kwargs): """ Parameters @@ -254,6 +319,7 @@ def __init__(self, infields=None, force_run=True, **kwargs): if force_run: self._always_run = True + # Get destination paths def _get_dst(self, src): # If path is directory with trailing os.path.sep, # then remove that for a more robust behavior @@ -277,6 +343,7 @@ def _get_dst(self, src): dst = dst[1:] return dst + # Substitute paths in substitutions dictionary parameter def _substitute(self, pathstr): pathstr_ = pathstr if isdefined(self.inputs.substitutions): @@ -297,17 +364,251 @@ def _substitute(self, pathstr): iflogger.info('sub: %s -> %s' % (pathstr_, pathstr)) return pathstr + # Check for s3 in base directory + def _check_s3_base_dir(self): + ''' + Method to see if the datasink's base directory specifies an + S3 bucket path; it it does, it parses the path for the bucket + name in the form 's3://bucket_name/...' and adds a bucket + attribute to the data sink instance, i.e. self.bucket + + Parameters + ---------- + + Returns + ------- + s3_flag : boolean + flag indicating whether the base_directory contained an + S3 bucket path + ''' + + # Import packages + import os + import sys + + # Init variables + s3_str = 's3://' + sep = os.path.sep + base_directory = self.inputs.base_directory + + # Check if 's3://' in base dir + if base_directory.startswith(s3_str): + try: + # Expects bucket name to be 's3://bucket_name/base_dir/..' + bucket_name = base_directory.split(s3_str)[1].split(sep)[0] + # Get the actual bucket object + self.bucket = self._fetch_bucket(bucket_name) + except Exception as exc: + err_msg = 'Unable to access S3 bucket. Error:\n%s. Exiting...'\ + % exc + print err_msg + sys.exit() + # Bucket access was a success, set flag + s3_flag = True + # Otherwise it's just a normal datasink + else: + s3_flag = False + + # Return s3_flag + return s3_flag + + # Function to return AWS secure environment variables + def _return_aws_keys(self, creds_path): + ''' + Method to return AWS access key id and secret access key using + credentials found in a local file. 
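+
+        As an illustration, the file is expected to contain just two such
+        lines (the key values below are the placeholder examples from the
+        AWS documentation, not real credentials):
+
+            AWSAccessKeyId=AKIAIOSFODNN7EXAMPLE
+            AWSSecretAccessKey=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY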
+ + Parameters + ---------- + creds_path : string (filepath) + path to the csv file with 'AWSAccessKeyId=' followed by access + key in the first row and 'AWSSecretAccessKey=' followed by + secret access key in the second row + + Returns + ------- + aws_access_key_id : string + string of the AWS access key ID + aws_secret_access_key : string + string of the AWS secret access key + ''' + + # Import packages + import csv + + # Init variables + csv_reader = csv.reader(open(creds_path, 'r')) + + # Grab csv rows + row1 = csv_reader.next()[0] + row2 = csv_reader.next()[0] + + # And split out for keys + aws_access_key_id = row1.split('=')[1] + aws_secret_access_key = row2.split('=')[1] + + # Return keys + return aws_access_key_id, aws_secret_access_key + + # Fetch bucket object + def _fetch_bucket(self, bucket_name): + ''' + Method to a return a bucket object which can be used to interact + with an AWS S3 bucket using credentials found in a local file. + + Parameters + ---------- + bucket_name : string + string corresponding to the name of the bucket on S3 + + Returns + ------- + bucket : boto3.resources.factory.s3.Bucket + boto3 s3 Bucket object which is used to interact with files + in an S3 bucket on AWS + ''' + + # Import packages + import logging + + try: + import boto3 + import botocore + except ImportError as exc: + err_msg = 'Boto3 package is not installed - install boto3 and '\ + 'try again.' + raise Exception(err_msg) + + # Init variables + creds_path = self.inputs.creds_path + iflogger = logging.getLogger('interface') + + # Try and get AWS credentials if a creds_path is specified + if creds_path: + try: + aws_access_key_id, aws_secret_access_key = \ + self._return_aws_keys(creds_path) + except Exception as exc: + err_msg = 'There was a problem extracting the AWS credentials '\ + 'from the credentials file provided: %s. Error:\n%s'\ + % (creds_path, exc) + raise Exception(err_msg) + # Init connection + iflogger.info('Connecting to S3 bucket: %s with credentials from '\ + '%s ...' % (bucket_name, creds_path)) + # Use individual session for each instance of DataSink + # Better when datasinks are being used in multi-threading, see: + # http://boto3.readthedocs.org/en/latest/guide/resources.html#multithreading + session = boto3.session.Session(aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key) + s3_resource = session.resource('s3', use_ssl=True) + + # Otherwise, connect anonymously + else: + iflogger.info('Connecting to AWS: %s anonymously...'\ + % bucket_name) + session = boto3.session.Session() + s3_resource = session.resource('s3', use_ssl=True) + s3_resource.meta.client.meta.events.register('choose-signer.s3.*', + botocore.handlers.disable_signing) + + # Explicitly declare a secure SSL connection for bucket object + bucket = s3_resource.Bucket(bucket_name) + + # And try fetch the bucket with the name argument + try: + s3_resource.meta.client.head_bucket(Bucket=bucket_name) + except botocore.exceptions.ClientError as exc: + error_code = int(exc.response['Error']['Code']) + if error_code == 403: + err_msg = 'Access to bucket: %s is denied; check credentials'\ + % bucket_name + raise Exception(err_msg) + elif error_code == 404: + err_msg = 'Bucket: %s does not exist; check spelling and try '\ + 'again' % bucket_name + raise Exception(err_msg) + else: + err_msg = 'Unable to connect to bucket: %s. Error message:\n%s'\ + % (bucket_name, exc) + except Exception as exc: + err_msg = 'Unable to connect to bucket: %s. 
Error message:\n%s'\ + % (bucket_name, exc) + raise Exception(err_msg) + + # Return the bucket + return bucket + + # Send up to S3 method + def _upload_to_s3(self, src, dst): + ''' + Method to upload outputs to S3 bucket instead of on local disk + ''' + + # Import packages + import logging + import os + + # Init variables + bucket = self.bucket + iflogger = logging.getLogger('interface') + s3_str = 's3://' + s3_prefix = os.path.join(s3_str, bucket.name) + + # If src is a directory, collect files (this assumes dst is a dir too) + if os.path.isdir(src): + src_files = [] + for root, dirs, files in os.walk(src): + src_files.extend([os.path.join(root, fil) for fil in files]) + # Make the dst files have the dst folder as base dir + dst_files = [os.path.join(dst, src_f.split(src)[1]) \ + for src_f in src_files] + else: + src_files = [src] + dst_files = [dst] + + # Iterate over src and copy to dst + for src_idx, src_f in enumerate(src_files): + # Get destination filename/keyname + dst_f = dst_files[src_idx] + dst_k = dst_f.replace(s3_prefix, '').lstrip('/') + + # Copy file up to S3 (either encrypted or not) + iflogger.info('Copying %s to S3 bucket, %s, as %s...'\ + % (src_f, bucket.name, dst_f)) + if self.inputs.encrypt_bucket_keys: + extra_args = {'ServerSideEncryption' : 'AES256'} + else: + extra_args = {} + bucket.upload_file(src_f, dst_k, ExtraArgs=extra_args, + Callback=ProgressPercentage(src_f)) + + # List outputs, main run routine def _list_outputs(self): """Execute this module. """ + + # Init variables + iflogger = logging.getLogger('interface') outputs = self.output_spec().get() out_files = [] outdir = self.inputs.base_directory + use_hardlink = str2bool(config.get('execution', + 'try_hard_link_datasink')) + + # If base directory isn't given, assume current directory if not isdefined(outdir): outdir = '.' 
- outdir = os.path.abspath(outdir) + + # Check if base directory reflects S3-bucket upload + s3_flag = self._check_s3_base_dir() + if not s3_flag: + outdir = os.path.abspath(outdir) + + # If container input is given, append that to outdir if isdefined(self.inputs.container): outdir = os.path.join(outdir, self.inputs.container) + # Create the directory if it doesn't exist if not os.path.exists(outdir): try: os.makedirs(outdir) @@ -316,8 +617,8 @@ def _list_outputs(self): pass else: raise(inst) - use_hardlink = str2bool(config.get('execution', - 'try_hard_link_datasink') ) + + # Iterate through outputs attributes {key : path(s)} for key, files in self.inputs._outputs.items(): if not isdefined(files): continue @@ -334,44 +635,49 @@ def _list_outputs(self): if isinstance(files[0], list): files = [item for sublist in files for item in sublist] + # Iterate through passed-in source files for src in filename_to_list(files): + # Format src and dst files src = os.path.abspath(src) - if os.path.isfile(src): - dst = self._get_dst(src) - dst = os.path.join(tempoutdir, dst) - dst = self._substitute(dst) - path, _ = os.path.split(dst) - if not os.path.exists(path): - try: - os.makedirs(path) - except OSError, inst: - if 'File exists' in inst: - pass - else: - raise(inst) - iflogger.debug("copyfile: %s %s" % (src, dst)) - copyfile(src, dst, copy=True, hashmethod='content', - use_hardlink=use_hardlink) - out_files.append(dst) - elif os.path.isdir(src): - dst = self._get_dst(os.path.join(src, '')) - dst = os.path.join(tempoutdir, dst) - dst = self._substitute(dst) - path, _ = os.path.split(dst) - if not os.path.exists(path): - try: - os.makedirs(path) - except OSError, inst: - if 'File exists' in inst: - pass - else: - raise(inst) - if os.path.exists(dst) and self.inputs.remove_dest_dir: - iflogger.debug("removing: %s" % dst) - shutil.rmtree(dst) - iflogger.debug("copydir: %s %s" % (src, dst)) - copytree(src, dst) + if not os.path.isfile(src): + src = os.path.join(src, '') + dst = self._get_dst(src) + dst = os.path.join(tempoutdir, dst) + dst = self._substitute(dst) + path, _ = os.path.split(dst) + + # Create output directory if it doesnt exist + if not os.path.exists(path): + try: + os.makedirs(path) + except OSError, inst: + if 'File exists' in inst: + pass + else: + raise(inst) + + # If we're uploading to S3 + if s3_flag: + self._upload_to_s3(src, dst) out_files.append(dst) + # Otherwise, copy locally src -> dst + else: + # If src is a file, copy it to dst + if os.path.isfile(src): + iflogger.debug('copyfile: %s %s' % (src, dst)) + copyfile(src, dst, copy=True, hashmethod='content', + use_hardlink=use_hardlink) + out_files.append(dst) + # If src is a directory, copy entire contents to dst dir + elif os.path.isdir(src): + if os.path.exists(dst) and self.inputs.remove_dest_dir: + iflogger.debug('removing: %s' % dst) + shutil.rmtree(dst) + iflogger.debug('copydir: %s %s' % (src, dst)) + copytree(src, dst) + out_files.append(dst) + + # Return outputs dictionary outputs['out_file'] = out_files return outputs From 4b0255815933a8e1c94b4b5a7de16a345bc48783 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Tue, 6 Oct 2015 15:58:33 -0400 Subject: [PATCH 07/45] Removed redundant imports --- nipype/interfaces/io.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index b35765c62e..71bce269f6 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -183,7 +183,7 @@ class DataSinkInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec): 
desc='Folder within base directory in which to store output') parameterization = traits.Bool(True, usedefault=True, desc='store output in parametrized structure') - strip_dir = traits.Directory(desc='path to strip out of filename') + strip_dir = Directory(desc='path to strip out of filename') substitutions = InputMultiPath(traits.Tuple(traits.Str, traits.Str), desc=('List of 2-tuples reflecting string ' 'to substitute and string to replace ' @@ -206,10 +206,9 @@ class DataSinkInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec): # Set call-able inputs attributes def __setattr__(self, key, value): - import nipype.interfaces.traits_extension as nit if key not in self.copyable_trait_names(): - if not nit.isdefined(value): + if not isdefined(value): super(DataSinkInputSpec, self).__setattr__(key, value) self._outputs[key] = value else: From 42f0b1b2e50673be3044fdc055c03d1ef181183d Mon Sep 17 00:00:00 2001 From: dclark87 Date: Tue, 6 Oct 2015 16:01:58 -0400 Subject: [PATCH 08/45] Quick cosmetic fix --- nipype/interfaces/io.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index 71bce269f6..3137836642 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -592,8 +592,7 @@ def _list_outputs(self): outputs = self.output_spec().get() out_files = [] outdir = self.inputs.base_directory - use_hardlink = str2bool(config.get('execution', - 'try_hard_link_datasink')) + use_hardlink = str2bool(config.get('execution', 'try_hard_link_datasink')) # If base directory isn't given, assume current directory if not isdefined(outdir): From 872e7529e9bbc3daf8b38a9cfa68a284813db9fd Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Wed, 7 Oct 2015 12:52:34 -0400 Subject: [PATCH 09/45] scheduler does not sleep --- nipype/pipeline/plugins/base.py | 10 ++++++++- nipype/pipeline/plugins/multiproc.py | 22 +++++++++++++------ .../pipeline/plugins/semaphore_singleton.py | 3 +++ 3 files changed, 27 insertions(+), 8 deletions(-) create mode 100644 nipype/pipeline/plugins/semaphore_singleton.py diff --git a/nipype/pipeline/plugins/base.py b/nipype/pipeline/plugins/base.py index b987dcfc13..0ad681e1f7 100644 --- a/nipype/pipeline/plugins/base.py +++ b/nipype/pipeline/plugins/base.py @@ -16,6 +16,7 @@ import numpy as np import scipy.sparse as ssp +import semaphore_singleton from ..utils import (nx, dfs_preorder, topological_sort) @@ -261,9 +262,16 @@ def run(self, graph, config, updatehash=False): graph=graph) else: logger.debug('Not submitting') - sleep(float(self._config['execution']['poll_sleep_duration'])) + + print 'locking semaphore' + print 'pending tasks:', len(self.pending_tasks) + if len(self.pending_tasks) > 0: + semaphore_singleton.semaphore.acquire() + print 'semaphore was released' + #sleep(float(self._config['execution']['poll_sleep_duration'])) self._remove_node_dirs() report_nodes_not_run(notrun) + semaphore_singleton.semaphore.release() def _get_result(self, taskid): raise NotImplementedError diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index 88c843ddab..5f91ab9e19 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -11,6 +11,9 @@ import sys from .base import (DistributedPluginBase, report_crash) +import semaphore_singleton + + def run_node(node, updatehash): result = dict(result=None, traceback=None) @@ -22,6 +25,13 @@ def run_node(node, updatehash): result['result'] = node.result return result + + +def release_lock(args): + print 
'releasing semaphore' + semaphore_singleton.semaphore.release() + + class NonDaemonProcess(Process): """A non-daemon process to support internal multiprocessing. """ @@ -66,6 +76,7 @@ def __init__(self, plugin_args=None): else: self.pool = Pool(processes=n_procs) + def _get_result(self, taskid): if taskid not in self._taskresult: raise RuntimeError('Multiproc task %d not found'%taskid) @@ -80,9 +91,8 @@ def _submit_job(self, node, updatehash=False): node.inputs.terminal_output = 'allatonce' except: pass - self._taskresult[self._taskid] = self.pool.apply_async(run_node, - (node, - updatehash,)) + self._taskresult[self._taskid] = self.pool.apply_async(run_node, (node, + updatehash,), callback=release_lock) return self._taskid def _report_crash(self, node, result=None): @@ -114,17 +124,15 @@ class ResourceMultiProcPlugin(MultiProcPlugin): def __init__(self, plugin_args=None): super(ResourceMultiProcPlugin, self).__init__(plugin_args=plugin_args) self.plugin_args = plugin_args - self.current_time = datetime.datetime.now() - self.log_nodes = [] def _send_procs_to_workers(self, updatehash=False, graph=None): """ Sends jobs to workers when system resources are available. - Check memory and cores usage before running jobs. + Check memory (mb) and cores usage before running jobs. """ executing_now = [] processors = cpu_count() memory = psutil.virtual_memory() - memory = memory.total + memory = memory.total / (1024*1024) if self.plugin_args: if 'n_procs' in self.plugin_args: processors = self.plugin_args['n_procs'] diff --git a/nipype/pipeline/plugins/semaphore_singleton.py b/nipype/pipeline/plugins/semaphore_singleton.py new file mode 100644 index 0000000000..b5b3ca79b9 --- /dev/null +++ b/nipype/pipeline/plugins/semaphore_singleton.py @@ -0,0 +1,3 @@ +print 'calling semaphore' +import threading +semaphore = threading.Semaphore(1) \ No newline at end of file From e465c281783ed165da01b5d3fc10615b76eaab2b Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Thu, 8 Oct 2015 11:30:10 -0400 Subject: [PATCH 10/45] clean code --- nipype/pipeline/plugins/base.py | 16 +++---- nipype/pipeline/plugins/callback_log.py | 13 +++++- nipype/pipeline/plugins/multiproc.py | 56 +++++++++++++++++++------ 3 files changed, 60 insertions(+), 25 deletions(-) diff --git a/nipype/pipeline/plugins/base.py b/nipype/pipeline/plugins/base.py index 0ad681e1f7..cee2c7dad5 100644 --- a/nipype/pipeline/plugins/base.py +++ b/nipype/pipeline/plugins/base.py @@ -16,8 +16,6 @@ import numpy as np import scipy.sparse as ssp -import semaphore_singleton - from ..utils import (nx, dfs_preorder, topological_sort) from ..engine import (MapNode, str2bool) @@ -262,16 +260,14 @@ def run(self, graph, config, updatehash=False): graph=graph) else: logger.debug('Not submitting') - - print 'locking semaphore' - print 'pending tasks:', len(self.pending_tasks) - if len(self.pending_tasks) > 0: - semaphore_singleton.semaphore.acquire() - print 'semaphore was released' - #sleep(float(self._config['execution']['poll_sleep_duration'])) + self._wait() self._remove_node_dirs() report_nodes_not_run(notrun) - semaphore_singleton.semaphore.release() + + + + def _wait(self): + sleep(float(self._config['execution']['poll_sleep_duration'])) def _get_result(self, taskid): raise NotImplementedError diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py index 52e2621f60..9ca38027c1 100644 --- a/nipype/pipeline/plugins/callback_log.py +++ b/nipype/pipeline/plugins/callback_log.py @@ -4,10 +4,18 @@ def log_nodes_cb(node, status): 
logger = logging.getLogger('callback') if status == 'start': - message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + node._id + '"' + ',"start":' + '"' +str(datetime.datetime.now()) + '"' + ',"memory":' + str(node._interface.memory) + ',"num_threads":' + str(node._interface.num_threads) + '}' + message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' +\ + node._id + '"' + ',"start":' + '"' +str(datetime.datetime.now()) +\ + '"' + ',"memory":' + str(node._interface.memory) + ',"num_threads":' \ + + str(node._interface.num_threads) + '}' + logger.debug(message) else: - message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) + '"' + ',"memory":' + str(node._interface.memory) + ',"num_threads":' + str(node._interface.num_threads) + '}' + message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ + node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ + '"' + ',"memory":' + str(node._interface.memory) + ',"num_threads":' \ + + str(node._interface.num_threads) + '}' + logger.debug(message) @@ -34,6 +42,7 @@ def convert_logcb_to_json(filename): if end is not None: element['finish'] = end['finish'] + element['duration'] = (parser.parse(element['finish']) - parser.parse(element['start'])).total_seconds() else: element['finish'] = element['start'] diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index 5f91ab9e19..53272aafc4 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -11,8 +11,6 @@ import sys from .base import (DistributedPluginBase, report_crash) -import semaphore_singleton - def run_node(node, updatehash): @@ -26,12 +24,6 @@ def run_node(node, updatehash): return result - -def release_lock(args): - print 'releasing semaphore' - semaphore_singleton.semaphore.release() - - class NonDaemonProcess(Process): """A non-daemon process to support internal multiprocessing. """ @@ -92,7 +84,7 @@ def _submit_job(self, node, updatehash=False): except: pass self._taskresult[self._taskid] = self.pool.apply_async(run_node, (node, - updatehash,), callback=release_lock) + updatehash,)) return self._taskid def _report_crash(self, node, result=None): @@ -109,25 +101,63 @@ def _clear_task(self, taskid): - - import numpy as np from copy import deepcopy from ..engine import (MapNode, str2bool) import datetime import psutil from ... import logging +import semaphore_singleton logger = logging.getLogger('workflow') +def release_lock(args): + semaphore_singleton.semaphore.release() + class ResourceMultiProcPlugin(MultiProcPlugin): + """Execute workflow with multiprocessing not sending more jobs at once + than the system can support. + + The plugin_args input to run can be used to control the multiprocessing + execution and defining the maximum amount of memory and threads that + should be used. + System consuming nodes should be tagged: + memory_consuming_node.interface.memory = 8 #Gb + thread_consuming_node.interface.num_threads = 16 + + The default number of threads and memory for a node is 1. + + Currently supported options are: + + - num_thread: maximum number of threads to be executed in parallel + - memory: maximum memory that can be used at once. 
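+
+    A minimal usage sketch, assuming an already built workflow called
+    workflow (the limits below are illustrative values; 'n_procs' and
+    'memory' are the plugin_args keys this plugin reads, and
+    'status_callback' is optional and only needed for the callback log):
+
+        from nipype.pipeline.plugins.callback_log import log_nodes_cb
+
+        workflow.run(plugin='ResourceMultiProc',
+                     plugin_args={'n_procs': 8,
+                                  'memory': 12,
+                                  'status_callback': log_nodes_cb})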
+ + """ def __init__(self, plugin_args=None): super(ResourceMultiProcPlugin, self).__init__(plugin_args=plugin_args) self.plugin_args = plugin_args + def _wait(self): + if len(self.pending_tasks) > 0: + semaphore_singleton.semaphore.acquire() + else: + semaphore_singleton.semaphore.release() + + + def _submit_job(self, node, updatehash=False): + self._taskid += 1 + try: + if node.inputs.terminal_output == 'stream': + node.inputs.terminal_output = 'allatonce' + except: + pass + self._taskresult[self._taskid] = self.pool.apply_async(run_node, (node, + updatehash,), callback=release_lock) + return self._taskid + def _send_procs_to_workers(self, updatehash=False, graph=None): """ Sends jobs to workers when system resources are available. - Check memory (mb) and cores usage before running jobs. + Check memory (gb) and cores usage before running jobs. """ executing_now = [] processors = cpu_count() @@ -222,7 +252,7 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): self._remove_node_dirs() else: - print('submitting', jobid) + logger.debug('submitting', jobid) tid = self._submit_job(deepcopy(self.procs[jobid]), updatehash=updatehash) if tid is None: self.proc_done[jobid] = False From e49965ca84b39441addfcec00ae9deaf900973e6 Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Thu, 8 Oct 2015 14:19:00 -0400 Subject: [PATCH 11/45] draw gant chart, small fixes --- nipype/pipeline/plugins/callback_log.py | 42 +----- nipype/pipeline/plugins/draw_gantt_chart.py | 148 ++++++++++++++++++++ nipype/pipeline/plugins/multiproc.py | 6 +- 3 files changed, 153 insertions(+), 43 deletions(-) create mode 100644 nipype/pipeline/plugins/draw_gantt_chart.py diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py index 9ca38027c1..c78356081d 100644 --- a/nipype/pipeline/plugins/callback_log.py +++ b/nipype/pipeline/plugins/callback_log.py @@ -15,45 +15,5 @@ def log_nodes_cb(node, status): node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ '"' + ',"memory":' + str(node._interface.memory) + ',"num_threads":' \ + str(node._interface.num_threads) + '}' - - logger.debug(message) - - - -import json -from dateutil import parser - -def convert_logcb_to_json(filename): - with open(filename, 'r') as content: - #read file separating each line - content = content.read() - lines = content.split('\n') - - #separate lines of starting nodes and ending nodes - starts = [ json.loads(x) for x in lines if '"start":' in x ] - ends = [json.loads(x) for x in lines if '"finish":' in x ] - - - - #foreach start, search its end. They have same name and id - #this line is O(n^2). 
refactor - for element in starts: - end = next((f for f in ends if (f['id'] == element['id'] and f['name'] == element['name'])), None) - - if end is not None: - element['finish'] = end['finish'] - element['duration'] = (parser.parse(element['finish']) - parser.parse(element['start'])).total_seconds() - else: - element['finish'] = element['start'] - - - first_node = starts[0]['start'] - last_node = ends[-1]['finish'] - - duration = parser.parse(last_node) - parser.parse(first_node) - #sorted(starts, key=lambda e: parser.parse(e['start'])) # sort by age - result = {'start': first_node, 'finish': last_node, 'duration':duration.total_seconds(), 'nodes': starts} - #finally, save the json file - with open(filename + '.json', 'w') as outfile: - json.dump(result, outfile) \ No newline at end of file + logger.debug(message) \ No newline at end of file diff --git a/nipype/pipeline/plugins/draw_gantt_chart.py b/nipype/pipeline/plugins/draw_gantt_chart.py new file mode 100644 index 0000000000..0478f88639 --- /dev/null +++ b/nipype/pipeline/plugins/draw_gantt_chart.py @@ -0,0 +1,148 @@ +import json +from dateutil import parser +import datetime +import random + + +def log_to_json(logfile): + result = [] + with open(logfile, 'r') as content: + + #read file separating each line + content = content.read() + lines = content.split('\n') + + lines = [ json.loads(x) for x in lines[:-1]] + + last_node = [ x for x in lines if x.has_key('finish')][-1] + + for i, line in enumerate(lines): + #get first start it finds + if not line.has_key('start'): + continue + + #fint the end node for that start + for j in range(i+1, len(lines)): + if lines[j].has_key('finish'): + if lines[j]['id'] == line['id'] and lines[j]['name'] == line['name']: + line['finish'] = lines[j]['finish'] + line['duration'] = (parser.parse(line['finish']) - parser.parse(line['start'])).total_seconds() + result.append(line) + break + + return result, last_node + + +#total duration in seconds +def draw_lines(start, total_duration, minute_scale, scale): + result = '' + next_line = 220 + next_time = start; + num_lines = int((total_duration/60) / minute_scale) +2; + + for i in range(num_lines): + new_line = "
" + result += new_line + + time = "

" + str(next_time.hour) + ':' + str(next_time.minute) + "

"; + result += time + + next_line += minute_scale * scale + next_time += datetime.timedelta(minutes=minute_scale) + return result + +def draw_nodes(start, nodes, cores, scale, colors): + result = '' + end_times = [datetime.datetime(start.year, start.month, start.day, start.hour, start.minute, start.second) for x in range(cores)] + + for node in nodes: + node_start = parser.parse(node['start']) + node_finish = parser.parse(node['finish']) + offset = ((node_start - start).total_seconds() / 60) * scale + 220 + scale_duration = (node['duration'] / 60) * scale + if scale_duration < 5: + scale_duration = 5 + + scale_duration -= 2 + left = 60 + for j in range(len(end_times)): + if end_times[j] < node_start: + left += j * 30 + end_times[j] = datetime.datetime(node_finish.year, node_finish.month, node_finish.day, node_finish.hour, node_finish.minute, node_finish.second) + #end_times[j]+= datetime.timedelta(microseconds=node_finish.microsecond) + break + + color = random.choice(colors) + new_node = "
"; + result += new_node + return result + + +''' +Generates a gantt chart in html showing the workflow execution based on a callback log file. +This script was intended to be used with the ResourceMultiprocPlugin. +The following code shows how to set up the workflow in order to generate the log file: + +# import logging +# import logging.handlers +# from nipype.pipeline.plugins.callback_log import log_nodes_cb + +# log_filename = 'callback.log' +# logger = logging.getLogger('callback') +# logger.setLevel(logging.DEBUG) +# handler = logging.FileHandler(log_filename) +# logger.addHandler(handler) + +# #create workflow +# workflow = ... + +# workflow.run(plugin='ResourceMultiProc', +# plugin_args={'num_threads':8, 'memory':12, 'status_callback': log_nodes_cb}) + +# generate_gantt_chart('callback.log', 8) +''' +def generate_gantt_chart(logfile, cores, minute_scale=10, space_between_minutes=50, colors=["#7070FF", "#4E4EB2", "#2D2D66", "#9B9BFF"]): + + result, last_node = log_to_json(logfile) + scale = space_between_minutes + + #add the html header + html_string = ''' + + + + + +
'''
+
+
+    #create the header of the report with useful information
+    start = parser.parse(result[0]['start'])
+    duration = int((parser.parse(last_node['finish']) - start).total_seconds())
+
+    html_string += '

Start: '+ result[0]['start'] +'

' + html_string += '

Finish: '+ last_node['finish'] +'

' + html_string += '

Duration: '+ str(duration/60) +' minutes

' + html_string += '

Nodes: '+str(len(result))+'

' + html_string += '

Cores: '+str(cores)+'

'
+
+
+    #draw lines
+    html_string += draw_lines(start, duration, minute_scale, scale)
+
+    #draw nodes
+    html_string += draw_nodes(start, result, cores, scale, colors)
+
+
+    #finish html
+    html_string+= '''
+ ''' + + #save file + html_file = open(logfile +'.html', 'wb') + html_file.write(html_string) + html_file.close() + + +generate_gantt_chart('/home/caroline/Desktop/callback.log', 8) \ No newline at end of file diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index 53272aafc4..8f14825ddd 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -119,7 +119,9 @@ class ResourceMultiProcPlugin(MultiProcPlugin): The plugin_args input to run can be used to control the multiprocessing execution and defining the maximum amount of memory and threads that - should be used. + should be used. When those parameters are not specified, + the number of threads and memory of the system is used. + System consuming nodes should be tagged: memory_consuming_node.interface.memory = 8 #Gb thread_consuming_node.interface.num_threads = 16 @@ -162,7 +164,7 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): executing_now = [] processors = cpu_count() memory = psutil.virtual_memory() - memory = memory.total / (1024*1024) + memory = memory.total / (1024*1024*1024) if self.plugin_args: if 'n_procs' in self.plugin_args: processors = self.plugin_args['n_procs'] From 34acdf8f8a3285ec80c1aa9def49a30e1eaa9702 Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Thu, 8 Oct 2015 17:06:41 -0400 Subject: [PATCH 12/45] add memory and thread to gantt chart, callback handles errors --- nipype/pipeline/plugins/callback_log.py | 12 +++++++++- nipype/pipeline/plugins/multiproc.py | 23 +++++++++---------- .../plugins => utils}/draw_gantt_chart.py | 5 +--- 3 files changed, 23 insertions(+), 17 deletions(-) rename nipype/{pipeline/plugins => utils}/draw_gantt_chart.py (98%) diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py index c78356081d..34952864b7 100644 --- a/nipype/pipeline/plugins/callback_log.py +++ b/nipype/pipeline/plugins/callback_log.py @@ -2,6 +2,7 @@ import logging def log_nodes_cb(node, status): + print 'status', status logger = logging.getLogger('callback') if status == 'start': message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' +\ @@ -10,10 +11,19 @@ def log_nodes_cb(node, status): + str(node._interface.num_threads) + '}' logger.debug(message) - else: + + elif status == 'end': message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ '"' + ',"memory":' + str(node._interface.memory) + ',"num_threads":' \ + str(node._interface.num_threads) + '}' + logger.debug(message) + + else: + message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ + node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ + '"' + ',"memory":' + str(node._interface.memory) + ',"num_threads":' \ + + str(node._interface.num_threads) + ',"error":"True"}' + logger.debug(message) \ No newline at end of file diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index 8f14825ddd..8d66be6999 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -138,12 +138,19 @@ class ResourceMultiProcPlugin(MultiProcPlugin): def __init__(self, plugin_args=None): super(ResourceMultiProcPlugin, self).__init__(plugin_args=plugin_args) self.plugin_args = plugin_args + self.processors = cpu_count() + memory = psutil.virtual_memory() + self.memory = memory.total / (1024*1024*1024) + if self.plugin_args: + if 'n_procs' in self.plugin_args: + self.processors = 
self.plugin_args['n_procs'] + if 'memory' in self.plugin_args: + self.memory = self.plugin_args['memory'] def _wait(self): if len(self.pending_tasks) > 0: semaphore_singleton.semaphore.acquire() - else: - semaphore_singleton.semaphore.release() + semaphore_singleton.semaphore.release() def _submit_job(self, node, updatehash=False): @@ -162,14 +169,6 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): Check memory (gb) and cores usage before running jobs. """ executing_now = [] - processors = cpu_count() - memory = psutil.virtual_memory() - memory = memory.total / (1024*1024*1024) - if self.plugin_args: - if 'n_procs' in self.plugin_args: - processors = self.plugin_args['n_procs'] - if 'memory' in self.plugin_args: - memory = self.plugin_args['memory'] # Check to see if a job is available jobids = np.flatnonzero((self.proc_pending == True) & (self.depidx.sum(axis=0) == 0).__array__()) @@ -181,8 +180,8 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): busy_memory+= self.procs[jobid]._interface.memory busy_processors+= self.procs[jobid]._interface.num_threads - free_memory = memory - busy_memory - free_processors = processors - busy_processors + free_memory = self.memory - busy_memory + free_processors = self.processors - busy_processors #check all jobs without dependency not run diff --git a/nipype/pipeline/plugins/draw_gantt_chart.py b/nipype/utils/draw_gantt_chart.py similarity index 98% rename from nipype/pipeline/plugins/draw_gantt_chart.py rename to nipype/utils/draw_gantt_chart.py index 0478f88639..85ae66ddb8 100644 --- a/nipype/pipeline/plugins/draw_gantt_chart.py +++ b/nipype/utils/draw_gantt_chart.py @@ -142,7 +142,4 @@ def generate_gantt_chart(logfile, cores, minute_scale=10, space_between_minutes= #save file html_file = open(logfile +'.html', 'wb') html_file.write(html_string) - html_file.close() - - -generate_gantt_chart('/home/caroline/Desktop/callback.log', 8) \ No newline at end of file + html_file.close() \ No newline at end of file From c9c92ef9181ae285178d4dd949af3cf360e17235 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Thu, 8 Oct 2015 18:09:47 -0400 Subject: [PATCH 13/45] Added handling of DataSink to save to a local directory if it cant access S3 --- nipype/interfaces/io.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index 3137836642..97812373b2 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -400,8 +400,7 @@ def _check_s3_base_dir(self): except Exception as exc: err_msg = 'Unable to access S3 bucket. Error:\n%s. Exiting...'\ % exc - print err_msg - sys.exit() + raise Exception(err_msg) # Bucket access was a success, set flag s3_flag = True # Otherwise it's just a normal datasink @@ -599,7 +598,18 @@ def _list_outputs(self): outdir = '.' # Check if base directory reflects S3-bucket upload - s3_flag = self._check_s3_base_dir() + try: + s3_flag = self._check_s3_base_dir() + # If encountering an exception during bucket access, set output + # base directory to a local folder + except Exception as exc: + local_out_exception = os.path.join(os.path.expanduser('~'), + 'data_output') + iflogger.info('Access to S3 failed! 
Storing outputs locally at: '\ + '%s\nError: %s' %(local_out_exception, exc)) + self.inputs.base_directory = local_out_exception + + # If not accessing S3, just set outdir to local absolute path if not s3_flag: outdir = os.path.abspath(outdir) From cb07b5ab4d1079ff759989d7c6443d852874954a Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Fri, 9 Oct 2015 11:01:54 -0400 Subject: [PATCH 14/45] add tests --- .../pipeline/plugins/semaphore_singleton.py | 1 - .../pipeline/plugins/tests/test_multiproc.py | 52 ++++---- nipype/utils/draw_gantt_chart.py | 118 +++++++++++++++++- 3 files changed, 144 insertions(+), 27 deletions(-) diff --git a/nipype/pipeline/plugins/semaphore_singleton.py b/nipype/pipeline/plugins/semaphore_singleton.py index b5b3ca79b9..8894615a14 100644 --- a/nipype/pipeline/plugins/semaphore_singleton.py +++ b/nipype/pipeline/plugins/semaphore_singleton.py @@ -1,3 +1,2 @@ -print 'calling semaphore' import threading semaphore = threading.Semaphore(1) \ No newline at end of file diff --git a/nipype/pipeline/plugins/tests/test_multiproc.py b/nipype/pipeline/plugins/tests/test_multiproc.py index 645d3bc567..d7e6b1661a 100644 --- a/nipype/pipeline/plugins/tests/test_multiproc.py +++ b/nipype/pipeline/plugins/tests/test_multiproc.py @@ -26,7 +26,7 @@ def _list_outputs(self): outputs['output1'] = [1, self.inputs.input1] return outputs -def test_run_multiproc(): +def run_multiproc(): cur_dir = os.getcwd() temp_dir = mkdtemp(prefix='test_engine_') os.chdir(temp_dir) @@ -49,8 +49,7 @@ def test_run_multiproc(): rmtree(temp_dir) - -################################# +################################ class InputSpecSingleNode(nib.TraitedSpec): @@ -75,15 +74,15 @@ def _list_outputs(self): return outputs -def parse_log(filename, measure): +def find_metrics(nodes, last_node): import json from dateutil.parser import parse from datetime import datetime import datetime as d - json_data = open(filename).read() - data = json.loads(json_data) - total_duration = int(float(data['duration'])) #total duration in seconds + + start = parse(nodes[0]['start']) + total_duration = int((parse(last_node['finish']) - start).total_seconds()) total_memory = [] total_threads = [] @@ -91,7 +90,7 @@ def parse_log(filename, measure): total_memory.append(0) total_threads.append(0) - now = parse(data['start']) + now = start for i in range(total_duration): start_index = 0 node_start = None @@ -99,13 +98,13 @@ def parse_log(filename, measure): x = now - for j in range(start_index, len(data['nodes'])): - node_start = parse(data['nodes'][j]['start']) - node_finish = parse(data['nodes'][j]['finish']) + for j in range(start_index, len(nodes)): + node_start = parse(nodes[j]['start']) + node_finish = parse(nodes[j]['finish']) if node_start < x and node_finish > x: - total_memory[i] += data['nodes'][j]['memory'] - total_threads[i] += data['nodes'][j]['num_threads'] + total_memory[i] += nodes[j]['memory'] + total_threads[i] += nodes[j]['num_threads'] start_index = j if node_start > x: @@ -117,11 +116,14 @@ def parse_log(filename, measure): import os -from nipype.pipeline.plugins.callback_log import log_nodes_cb, convert_logcb_to_json +from nipype.pipeline.plugins.callback_log import log_nodes_cb import logging import logging.handlers import psutil from multiprocessing import cpu_count + +from nipype.utils import draw_gantt_chart + def test_do_not_use_more_memory_then_specified(): LOG_FILENAME = 'callback.log' my_logger = logging.getLogger('callback') @@ -148,12 +150,14 @@ def test_do_not_use_more_memory_then_specified(): 
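For reference, a minimal end-to-end use of the resource-aware scheduling added in this series might look like the following sketch. The node names and limits are hypothetical, it reuses the toy TestInterfaceSingleNode defined in this test module, and it assumes the interface-level memory/num_threads attributes, the 'ResourceMultiProc' plugin name and the log_nodes_cb callback introduced in these patches.

import logging
import nipype.pipeline.engine as pe
from nipype.pipeline.plugins.callback_log import log_nodes_cb

# route the 'callback' logger to a file so log_nodes_cb output is captured
logger = logging.getLogger('callback')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.FileHandler('callback.log'))

wf = pe.Workflow(name='resource_demo')
heavy = pe.Node(interface=TestInterfaceSingleNode(), name='heavy')
light = pe.Node(interface=TestInterfaceSingleNode(), name='light')
heavy.interface.memory = 8        # Gb estimate (renamed to estimated_memory later in the series)
heavy.interface.num_threads = 4
light.interface.memory = 1
heavy.inputs.input1 = 1
wf.connect(heavy, 'output1', light, 'input1')

# cap the scheduler at 12 Gb / 8 cores and log every node start/finish
wf.run(plugin='ResourceMultiProc',
       plugin_args={'memory': 12,
                    'n_procs': 8,
                    'status_callback': log_nodes_cb})
# afterwards the log can be rendered with
# nipype.utils.draw_gantt_chart.generate_gantt_chart('callback.log', 8)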
pipe.connect(n2, 'output1', n4, 'input1') pipe.connect(n3, 'output1', n4, 'input2') n1.inputs.input1 = 10 - pipe.config['execution']['poll_sleep_duration'] = 1 - pipe.run(plugin='ResourceMultiProc', plugin_args={'memory': max_memory, 'status_callback': log_nodes_cb}) - convert_logcb_to_json(LOG_FILENAME) - #memory usage in every second - memory, threads = parse_log(LOG_FILENAME + '.json' , 'memory') + pipe.run(plugin='ResourceMultiProc', plugin_args={'memory': max_memory, + 'status_callback': log_nodes_cb}) + + + nodes, last_node = draw_gantt_chart.log_to_json(LOG_FILENAME) + #usage in every second + memory, threads = find_metrics(nodes, last_node) result = True for m in memory: @@ -174,7 +178,6 @@ def test_do_not_use_more_memory_then_specified(): yield assert_equal, result, True, "using more threads than system has (threads is not specified by user)" os.remove(LOG_FILENAME) - os.remove(LOG_FILENAME + '.json') def test_do_not_use_more_threads_then_specified(): @@ -206,9 +209,9 @@ def test_do_not_use_more_threads_then_specified(): pipe.config['execution']['poll_sleep_duration'] = 1 pipe.run(plugin='ResourceMultiProc', plugin_args={'n_procs': max_threads, 'status_callback': log_nodes_cb}) - convert_logcb_to_json(LOG_FILENAME) - #threads usage in every second - memory, threads = parse_log(LOG_FILENAME + '.json' , 'num_threads') + nodes, last_node = draw_gantt_chart.log_to_json(LOG_FILENAME) + #usage in every second + memory, threads = find_metrics(nodes, last_node) result = True for t in threads: @@ -226,5 +229,4 @@ def test_do_not_use_more_threads_then_specified(): break yield assert_equal, result, True, "using more memory than system has (memory is not specified by user)" - os.remove(LOG_FILENAME) - os.remove(LOG_FILENAME + '.json') \ No newline at end of file + os.remove(LOG_FILENAME) \ No newline at end of file diff --git a/nipype/utils/draw_gantt_chart.py b/nipype/utils/draw_gantt_chart.py index 85ae66ddb8..5adff16c3d 100644 --- a/nipype/utils/draw_gantt_chart.py +++ b/nipype/utils/draw_gantt_chart.py @@ -78,6 +78,79 @@ def draw_nodes(start, nodes, cores, scale, colors): return result +def draw_thread_bar(start, total_duration, nodes, space_between_minutes, minute_scale): + result = "

Threads

" + + total = total_duration/60 + thread = [0 for x in range(total)] + + now = start + + #calculate nuber of threads in every second + for i in range(total): + node_start = None + node_finish = None + + for j in range(i, len(nodes)): + node_start = parser.parse(nodes[j]['start']) + node_finish = parser.parse(nodes[j]['finish']) + + if node_start <= now and node_finish >= now: + thread[i] += nodes[j]['num_threads'] + if node_start > now: + break + now += datetime.timedelta(minutes=1) + + + #draw thread bar + scale = float(space_between_minutes/float(minute_scale)) + + for i in range(len(thread)): + width = thread[i] * 10 + t = (i*scale*minute_scale) + 220 + bar = "
" + result += bar + + return result + + + +def draw_memory_bar(start, total_duration, nodes, space_between_minutes, minute_scale): + result = "

Memory

" + + total = total_duration/60 + memory = [0 for x in range(total)] + + now = start + + #calculate nuber of threads in every second + for i in range(total): + node_start = None + node_finish = None + + for j in range(i, len(nodes)): + node_start = parser.parse(nodes[j]['start']) + node_finish = parser.parse(nodes[j]['finish']) + + if node_start <= now and node_finish >= now: + memory[i] += nodes[j]['memory'] + if node_start > now: + break + now += datetime.timedelta(minutes=1) + + + #draw thread bar + scale = float(space_between_minutes/float(minute_scale)) + + for i in range(len(memory)): + width = memory[i] * 10 + t = (i*scale*minute_scale) + 220 + bar = "
" + result += bar + + return result + + ''' Generates a gantt chart in html showing the workflow execution based on a callback log file. This script was intended to be used with the ResourceMultiprocPlugin. @@ -109,7 +182,48 @@ def generate_gantt_chart(logfile, cores, minute_scale=10, space_between_minutes= #add the html header html_string = ''' - + @@ -133,6 +247,8 @@ def generate_gantt_chart(logfile, cores, minute_scale=10, space_between_minutes= #draw nodes html_string += draw_nodes(start, result, cores, scale, colors) + html_string += draw_thread_bar(start, duration, result, space_between_minutes, minute_scale) + html_string += draw_memory_bar(start, duration, result, space_between_minutes, minute_scale) #finish html html_string+= ''' From 827d2c2e6fd038f1c3104fc8e33af83c4e527e1d Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Fri, 9 Oct 2015 11:15:45 -0400 Subject: [PATCH 15/45] fix method name --- nipype/pipeline/plugins/tests/test_multiproc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nipype/pipeline/plugins/tests/test_multiproc.py b/nipype/pipeline/plugins/tests/test_multiproc.py index d7e6b1661a..5e841b78a3 100644 --- a/nipype/pipeline/plugins/tests/test_multiproc.py +++ b/nipype/pipeline/plugins/tests/test_multiproc.py @@ -26,7 +26,7 @@ def _list_outputs(self): outputs['output1'] = [1, self.inputs.input1] return outputs -def run_multiproc(): +def test_run_multiproc(): cur_dir = os.getcwd() temp_dir = mkdtemp(prefix='test_engine_') os.chdir(temp_dir) From a8f8006d9dee7c94185b017bccefebbf72a63f77 Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Fri, 9 Oct 2015 15:36:11 -0400 Subject: [PATCH 16/45] fix typos --- nipype/interfaces/io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index 97812373b2..b979297f66 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -134,7 +134,7 @@ def _add_output_traits(self, base): # Class to track percentage of S3 file upload class ProgressPercentage(object): ''' - Call-able class instsance (via __call__ method) that displays + Callable class instsance (via __call__ method) that displays upload percentage of a file to S3 ''' @@ -367,7 +367,7 @@ def _substitute(self, pathstr): def _check_s3_base_dir(self): ''' Method to see if the datasink's base directory specifies an - S3 bucket path; it it does, it parses the path for the bucket + S3 bucket path; if it does, it parses the path for the bucket name in the form 's3://bucket_name/...' and adds a bucket attribute to the data sink instance, i.e. self.bucket @@ -451,7 +451,7 @@ def _return_aws_keys(self, creds_path): # Fetch bucket object def _fetch_bucket(self, bucket_name): ''' - Method to a return a bucket object which can be used to interact + Method to return a bucket object which can be used to interact with an AWS S3 bucket using credentials found in a local file. 
Parameters From 300d20c1a2b3ca7d3c0599349f9823c93907cf3e Mon Sep 17 00:00:00 2001 From: dclark87 Date: Thu, 15 Oct 2015 16:48:50 -0400 Subject: [PATCH 17/45] Update io.py --- nipype/interfaces/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index b979297f66..1d50062eb0 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -572,7 +572,7 @@ def _upload_to_s3(self, src, dst): dst_k = dst_f.replace(s3_prefix, '').lstrip('/') # Copy file up to S3 (either encrypted or not) - iflogger.info('Copying %s to S3 bucket, %s, as %s...'\ + iflogger.info('Uploading %s to S3 bucket, %s, as %s...'\ % (src_f, bucket.name, dst_f)) if self.inputs.encrypt_bucket_keys: extra_args = {'ServerSideEncryption' : 'AES256'} From 0503c23912d9d90b0010cf83b8594b1594f0d32b Mon Sep 17 00:00:00 2001 From: dclark87 Date: Thu, 15 Oct 2015 18:02:49 -0400 Subject: [PATCH 18/45] Added md5 checking for s3 --- nipype/interfaces/io.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index 97812373b2..ef4cc5aab0 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -544,9 +544,12 @@ def _upload_to_s3(self, src, dst): ''' # Import packages + import hashlib import logging import os + from botocore.exceptions import ClientError + # Init variables bucket = self.bucket iflogger = logging.getLogger('interface') @@ -571,8 +574,25 @@ def _upload_to_s3(self, src, dst): dst_f = dst_files[src_idx] dst_k = dst_f.replace(s3_prefix, '').lstrip('/') + # See if same file is already up there + try: + dst_obj = bucket.Object(key=dst_k) + dst_md5 = dst_obj.e_tag.strip('"') + + # See if same file is already there + src_read = open(src_f, 'rb').read() + src_md5 = hashlib.md5(src_read).hexdigest() + # Move to next loop iteration + if dst_md5 == src_md5: + continue + else: + iflogger.info('Overwriting previous S3 file...') + + except ClientError as exc: + iflogger.info('New file to S3') + # Copy file up to S3 (either encrypted or not) - iflogger.info('Copying %s to S3 bucket, %s, as %s...'\ + iflogger.info('Uploading %s to S3 bucket, %s, as %s...'\ % (src_f, bucket.name, dst_f)) if self.inputs.encrypt_bucket_keys: extra_args = {'ServerSideEncryption' : 'AES256'} From f6cfad76205bebbb8f96db88b790f3c5f6ddb158 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Thu, 15 Oct 2015 18:39:28 -0400 Subject: [PATCH 19/45] Added message about file already existsing --- nipype/interfaces/io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index 1f1396ae45..b5c2e9d1f5 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -584,6 +584,7 @@ def _upload_to_s3(self, src, dst): src_md5 = hashlib.md5(src_read).hexdigest() # Move to next loop iteration if dst_md5 == src_md5: + iflogger.info('File %s already exists on S3, skipping...' 
% dst_f) continue else: iflogger.info('Overwriting previous S3 file...') From 186d00a14abc7c4ff3cf3934d1f1763308ed0edd Mon Sep 17 00:00:00 2001 From: dclark87 Date: Wed, 21 Oct 2015 16:02:31 -0400 Subject: [PATCH 20/45] Fixed dive by 0 bug --- nipype/interfaces/io.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index b5c2e9d1f5..85f53b1cd1 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -162,7 +162,10 @@ def __call__(self, bytes_amount): # With the lock on, print upload status with self._lock: self._seen_so_far += bytes_amount - percentage = (self._seen_so_far / self._size) * 100 + if self._size != 0: + percentage = (self._seen_so_far / self._size) * 100 + else: + percentage = 0 progress_str = '%d / %d (%.2f%%)\r'\ % (self._seen_so_far, self._size, percentage) From f77371b9649ecc2aba5581c893677a4302f2a5a5 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Fri, 30 Oct 2015 16:19:00 -0400 Subject: [PATCH 21/45] Added upper/lower case support for S3 prefix --- nipype/interfaces/io.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index 85f53b1cd1..89d1bf9bb0 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -393,6 +393,12 @@ def _check_s3_base_dir(self): sep = os.path.sep base_directory = self.inputs.base_directory + # Explicitly lower-case the "s3" + if base_directory.lower().startswith(s3_str): + base_dir_sp = base_directory.split('/') + base_dir_sp[0] = base_dir_sp[0].lower() + base_directory = '/'.join(base_dir_sp) + # Check if 's3://' in base dir if base_directory.startswith(s3_str): try: @@ -559,6 +565,12 @@ def _upload_to_s3(self, src, dst): s3_str = 's3://' s3_prefix = os.path.join(s3_str, bucket.name) + # Explicitly lower-case the "s3" + if dst.lower().startswith(s3_str): + dst_sp = dst.split('/') + dst_sp[0] = dst_sp[0].lower() + dst = '/'.join(dst_sp) + # If src is a directory, collect files (this assumes dst is a dir too) if os.path.isdir(src): src_files = [] From e2f51f61edf26b88907eea9f46af3f9bf8420235 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Tue, 3 Nov 2015 16:59:28 -0500 Subject: [PATCH 22/45] Added support for both non-root and root AWS creds in DataSink --- nipype/interfaces/io.py | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index 89d1bf9bb0..f8b4f16f60 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -144,7 +144,6 @@ def __init__(self, filename): # Import packages import threading - import os # Initialize data attributes self._filename = filename @@ -384,10 +383,6 @@ def _check_s3_base_dir(self): S3 bucket path ''' - # Import packages - import os - import sys - # Init variables s3_str = 's3://' sep = os.path.sep @@ -428,9 +423,8 @@ def _return_aws_keys(self, creds_path): Parameters ---------- creds_path : string (filepath) - path to the csv file with 'AWSAccessKeyId=' followed by access - key in the first row and 'AWSSecretAccessKey=' followed by - secret access key in the second row + path to the csv file downloaded from AWS; can either be root + or user credentials Returns ------- @@ -440,19 +434,28 @@ def _return_aws_keys(self, creds_path): string of the AWS secret access key ''' - # Import packages - import csv - # Init variables - csv_reader = csv.reader(open(creds_path, 'r')) - - # Grab csv rows - row1 = csv_reader.next()[0] - row2 = csv_reader.next()[0] 
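The skip-if-identical logic a few hunks above compares a local MD5 digest with the object's ETag; one caveat worth noting (an assumption, not stated in the patch) is that S3 ETags only equal a plain MD5 for single-part uploads, so a standalone version of that check might look like:

import hashlib

def already_uploaded_sketch(local_path, bucket, key):
    # Return True when the S3 object exists and its ETag matches the local
    # file's MD5; multipart ETags (containing '-') are treated as a mismatch
    with open(local_path, 'rb') as local_file:
        local_md5 = hashlib.md5(local_file.read()).hexdigest()
    try:
        etag = bucket.Object(key=key).e_tag.strip('"')
    except Exception:
        return False
    if '-' in etag:
        return False
    return etag == local_md5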
+ with open(creds_path, 'r') as creds_in: + # Grab csv rows + row1 = creds_in.readline() + row2 = creds_in.readline() + + # Are they root or user keys + if 'User Name' in row1: + # And split out for keys + aws_access_key_id = row2.split(',')[1] + aws_secret_access_key = row2.split(',')[2] + elif 'AWSAccessKeyId' in row1: + # And split out for keys + aws_access_key_id = row1.split('=')[1] + aws_secret_access_key = row2.split('=')[1] + else: + err_msg = 'Credentials file not recognized, check file is correct' + raise Exception(err_msg) - # And split out for keys - aws_access_key_id = row1.split('=')[1] - aws_secret_access_key = row2.split('=')[1] + # Strip any carriage return/line feeds + aws_access_key_id = aws_access_key_id.replace('\r', '').replace('\n', '') + aws_secret_access_key = aws_secret_access_key.replace('\r', '').replace('\n', '') # Return keys return aws_access_key_id, aws_secret_access_key From 350fd4a96300c87796183cd7c4bcccd72cc8cfe3 Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Wed, 25 Nov 2015 09:40:53 -0500 Subject: [PATCH 23/45] add attribute real_memory to interface, change attr memory to estimated_memory --- nipype/interfaces/base.py | 3 ++- nipype/pipeline/plugins/callback_log.py | 6 +++--- nipype/pipeline/plugins/multiproc.py | 10 +++++----- nipype/pipeline/plugins/tests/test_multiproc.py | 10 +++++----- nipype/utils/draw_gantt_chart.py | 2 +- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/nipype/interfaces/base.py b/nipype/interfaces/base.py index 854fb44fe1..694f858b11 100644 --- a/nipype/interfaces/base.py +++ b/nipype/interfaces/base.py @@ -750,7 +750,8 @@ def __init__(self, **inputs): raise Exception('No input_spec in class: %s' % self.__class__.__name__) self.inputs = self.input_spec(**inputs) - self.memory = 1 + self.estimated_memory = 1 + self.real_memory = 0 self.num_threads = 1 @classmethod diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py index 34952864b7..48a4f28637 100644 --- a/nipype/pipeline/plugins/callback_log.py +++ b/nipype/pipeline/plugins/callback_log.py @@ -7,7 +7,7 @@ def log_nodes_cb(node, status): if status == 'start': message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' +\ node._id + '"' + ',"start":' + '"' +str(datetime.datetime.now()) +\ - '"' + ',"memory":' + str(node._interface.memory) + ',"num_threads":' \ + '"' + ',"memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ + str(node._interface.num_threads) + '}' logger.debug(message) @@ -15,7 +15,7 @@ def log_nodes_cb(node, status): elif status == 'end': message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ - '"' + ',"memory":' + str(node._interface.memory) + ',"num_threads":' \ + '"' + ',"memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ + str(node._interface.num_threads) + '}' logger.debug(message) @@ -23,7 +23,7 @@ def log_nodes_cb(node, status): else: message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ - '"' + ',"memory":' + str(node._interface.memory) + ',"num_threads":' \ + '"' + ',"memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ + str(node._interface.num_threads) + ',"error":"True"}' logger.debug(message) \ No newline at end of file diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index 8d66be6999..a8a99325a9 100644 --- 
a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -177,7 +177,7 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): busy_memory = 0 busy_processors = 0 for jobid in jobids: - busy_memory+= self.procs[jobid]._interface.memory + busy_memory+= self.procs[jobid]._interface.estimated_memory busy_processors+= self.procs[jobid]._interface.num_threads free_memory = self.memory - busy_memory @@ -190,7 +190,7 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): #sort jobs ready to run first by memory and then by number of threads #The most resource consuming jobs run first - jobids = sorted(jobids, key=lambda item: (self.procs[item]._interface.memory, self.procs[item]._interface.num_threads)) + jobids = sorted(jobids, key=lambda item: (self.procs[item]._interface.estimated_memory, self.procs[item]._interface.num_threads)) logger.debug('Free memory: %d, Free processors: %d', free_memory, free_processors) @@ -198,9 +198,9 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): #while have enough memory and processors for first job #submit first job on the list for jobid in jobids: - logger.debug('Next Job: %d, memory: %d, threads: %d' %(jobid, self.procs[jobid]._interface.memory, self.procs[jobid]._interface.num_threads)) + logger.debug('Next Job: %d, memory: %d, threads: %d' %(jobid, self.procs[jobid]._interface.estimated_memory, self.procs[jobid]._interface.num_threads)) - if self.procs[jobid]._interface.memory <= free_memory and self.procs[jobid]._interface.num_threads <= free_processors: + if self.procs[jobid]._interface.estimated_memory <= free_memory and self.procs[jobid]._interface.num_threads <= free_processors: logger.info('Executing: %s ID: %d' %(self.procs[jobid]._id, jobid)) executing_now.append(self.procs[jobid]) @@ -220,7 +220,7 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): self.proc_done[jobid] = True self.proc_pending[jobid] = True - free_memory -= self.procs[jobid]._interface.memory + free_memory -= self.procs[jobid]._interface.estimated_memory free_processors -= self.procs[jobid]._interface.num_threads # Send job to task manager and add to pending tasks diff --git a/nipype/pipeline/plugins/tests/test_multiproc.py b/nipype/pipeline/plugins/tests/test_multiproc.py index 5e841b78a3..d2f281eadd 100644 --- a/nipype/pipeline/plugins/tests/test_multiproc.py +++ b/nipype/pipeline/plugins/tests/test_multiproc.py @@ -103,7 +103,7 @@ def find_metrics(nodes, last_node): node_finish = parse(nodes[j]['finish']) if node_start < x and node_finish > x: - total_memory[i] += nodes[j]['memory'] + total_memory[i] += nodes[j]['estimated_memory'] total_threads[i] += nodes[j]['num_threads'] start_index = j @@ -140,10 +140,10 @@ def test_do_not_use_more_memory_then_specified(): n3 = pe.Node(interface=TestInterfaceSingleNode(), name='n3') n4 = pe.Node(interface=TestInterfaceSingleNode(), name='n4') - n1.interface.memory = 1 - n2.interface.memory = 1 - n3.interface.memory = 10 - n4.interface.memory = 1 + n1.interface.estimated_memory = 1 + n2.interface.estimated_memory = 1 + n3.interface.estimated_memory = 10 + n4.interface.estimated_memory = 1 pipe.connect(n1, 'output1', n2, 'input1') pipe.connect(n1, 'output1', n3, 'input1') diff --git a/nipype/utils/draw_gantt_chart.py b/nipype/utils/draw_gantt_chart.py index 5adff16c3d..84bbc033a0 100644 --- a/nipype/utils/draw_gantt_chart.py +++ b/nipype/utils/draw_gantt_chart.py @@ -133,7 +133,7 @@ def draw_memory_bar(start, total_duration, nodes, space_between_minutes, 
minute_ node_finish = parser.parse(nodes[j]['finish']) if node_start <= now and node_finish >= now: - memory[i] += nodes[j]['memory'] + memory[i] += nodes[j]['estimated_memory'] if node_start > now: break now += datetime.timedelta(minutes=1) From f74fe25835c49749c85c8834d9249220646fea2e Mon Sep 17 00:00:00 2001 From: Cameron Craddock Date: Wed, 25 Nov 2015 14:55:32 +0000 Subject: [PATCH 24/45] Added real memory recording to plugn --- nipype/pipeline/plugins/callback_log.py | 8 ++++---- nipype/pipeline/plugins/multiproc.py | 19 +++++++++++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py index 48a4f28637..951a6f8291 100644 --- a/nipype/pipeline/plugins/callback_log.py +++ b/nipype/pipeline/plugins/callback_log.py @@ -7,8 +7,8 @@ def log_nodes_cb(node, status): if status == 'start': message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' +\ node._id + '"' + ',"start":' + '"' +str(datetime.datetime.now()) +\ - '"' + ',"memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ - + str(node._interface.num_threads) + '}' + '"' + ',"estimate memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ + + str(node._interface.num_threads) + '}' logger.debug(message) @@ -16,7 +16,7 @@ def log_nodes_cb(node, status): message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ '"' + ',"memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ - + str(node._interface.num_threads) + '}' + + str(node._interface.num_threads) + ',"real memory":' str(node._interface.real_memory) + '}' logger.debug(message) @@ -26,4 +26,4 @@ def log_nodes_cb(node, status): '"' + ',"memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ + str(node._interface.num_threads) + ',"error":"True"}' - logger.debug(message) \ No newline at end of file + logger.debug(message) diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index a8a99325a9..b9fc5c9d20 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -13,8 +13,18 @@ from .base import (DistributedPluginBase, report_crash) -def run_node(node, updatehash): +def run_node(node, updatehash, plugin_args=None): result = dict(result=None, traceback=None) + try: + run_memory = plugin_args['memory_profile'] + except Exception: + run_memory = False + if run_memory: + import memory_profiler + proc = (node.run(), (), {'updatehash' : updatehash}) + mem_mb, retval = memory_profiler.memory_usage(proc, max_usage=True, retval=True) + result['result'] = retval + node._interface.real_memory = mem_mb[0]/1024.0 try: result['result'] = node.run(updatehash=updatehash) except: @@ -160,8 +170,9 @@ def _submit_job(self, node, updatehash=False): node.inputs.terminal_output = 'allatonce' except: pass - self._taskresult[self._taskid] = self.pool.apply_async(run_node, (node, - updatehash,), callback=release_lock) + self._taskresult[self._taskid] = self.pool.apply_async(run_node, + (node, updatehash, self.plugin_args), + callback=release_lock) return self._taskid def _send_procs_to_workers(self, updatehash=False, graph=None): @@ -263,4 +274,4 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): else: break - logger.debug('No jobs waiting to execute') \ No newline at end of file + logger.debug('No jobs waiting to execute') From 1e66b864285513c3785b67e209476f6a9663d1ab Mon Sep 17 
00:00:00 2001 From: Cameron Craddock Date: Wed, 25 Nov 2015 19:01:22 +0000 Subject: [PATCH 25/45] Added initial code for getting used memory of node --- nipype/interfaces/base.py | 10 +++++++++- nipype/pipeline/plugins/base.py | 5 ++++- nipype/pipeline/plugins/callback_log.py | 4 ++-- nipype/pipeline/plugins/multiproc.py | 23 ++++++++++++++--------- 4 files changed, 29 insertions(+), 13 deletions(-) diff --git a/nipype/interfaces/base.py b/nipype/interfaces/base.py index 694f858b11..a0b1110098 100644 --- a/nipype/interfaces/base.py +++ b/nipype/interfaces/base.py @@ -751,9 +751,17 @@ def __init__(self, **inputs): self.__class__.__name__) self.inputs = self.input_spec(**inputs) self.estimated_memory = 1 - self.real_memory = 0 + self._real_memory = 0 self.num_threads = 1 + @property + def real_memory(self): + return self._real_memory + + @real_memory.setter + def real_memory(self, value): + self._real_memory = value + @classmethod def help(cls, returnhelp=False): """ Prints class help diff --git a/nipype/pipeline/plugins/base.py b/nipype/pipeline/plugins/base.py index cee2c7dad5..bda811354d 100644 --- a/nipype/pipeline/plugins/base.py +++ b/nipype/pipeline/plugins/base.py @@ -416,7 +416,10 @@ def _task_finished_cb(self, jobid): logger.info('[Job finished] jobname: %s jobid: %d' % (self.procs[jobid]._id, jobid)) if self._status_callback: - self._status_callback(self.procs[jobid], 'end') + print '!!!!!!!!!!!!!!!!!!!' + print self._taskresult + print self._taskresult.keys() + self._status_callback(self.procs[jobid], 'end', self._taskresult[self.taskresultid]) # Update job and worker queues self.proc_pending[jobid] = False # update the job dependency structure diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py index 951a6f8291..6fad0eee44 100644 --- a/nipype/pipeline/plugins/callback_log.py +++ b/nipype/pipeline/plugins/callback_log.py @@ -1,7 +1,7 @@ import datetime import logging -def log_nodes_cb(node, status): +def log_nodes_cb(node, status, result=None): print 'status', status logger = logging.getLogger('callback') if status == 'start': @@ -16,7 +16,7 @@ def log_nodes_cb(node, status): message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ '"' + ',"memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ - + str(node._interface.num_threads) + ',"real memory":' str(node._interface.real_memory) + '}' + + str(node._interface.num_threads) + ',"real memory":' + str(result['real_memory']) + '}' logger.debug(message) diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index b9fc5c9d20..5091d25b27 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -21,16 +21,20 @@ def run_node(node, updatehash, plugin_args=None): run_memory = False if run_memory: import memory_profiler - proc = (node.run(), (), {'updatehash' : updatehash}) + proc = (node.run, (), {'updatehash' : updatehash}) mem_mb, retval = memory_profiler.memory_usage(proc, max_usage=True, retval=True) result['result'] = retval - node._interface.real_memory = mem_mb[0]/1024.0 - try: - result['result'] = node.run(updatehash=updatehash) - except: - etype, eval, etr = sys.exc_info() - result['traceback'] = format_exception(etype,eval,etr) - result['result'] = node.result + result['real_memory'] = 100 + print 'Just populated task result!!!!!!!!!!!!!!!!!!!' 
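For orientation, the profiling pattern used by run_node above can be exercised on its own roughly as follows; this is a sketch that assumes the memory_profiler API of the time, where max_usage=True returns a one-element list (hence the mem_mb[0] indexing in the patch):

import memory_profiler

def profiled_call_sketch(func, *args, **kwargs):
    # Run func(*args, **kwargs) while sampling peak memory of the process and
    # its children; return (peak_in_gb, return_value)
    mem_mb, retval = memory_profiler.memory_usage(
        (func, args, kwargs), max_usage=True, retval=True, include_children=True)
    peak_mb = mem_mb[0] if isinstance(mem_mb, (list, tuple)) else mem_mb
    return peak_mb / 1024.0, retval

# peak_gb, result = profiled_call_sketch(node.run, updatehash=False)  # hypothetical node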
+ print result + #node._interface.real_memory = mem_mb[0]/1024.0 + else: + try: + result['result'] = node.run(updatehash=updatehash) + except: + etype, eval, etr = sys.exc_info() + result['traceback'] = format_exception(etype,eval,etr) + result['result'] = node.result return result @@ -173,6 +177,8 @@ def _submit_job(self, node, updatehash=False): self._taskresult[self._taskid] = self.pool.apply_async(run_node, (node, updatehash, self.plugin_args), callback=release_lock) + print 'Printing on output!!!!!!!!!!' + print self._taskresult, self._taskid return self._taskid def _send_procs_to_workers(self, updatehash=False, graph=None): @@ -237,7 +243,6 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): # Send job to task manager and add to pending tasks if self._status_callback: self._status_callback(self.procs[jobid], 'start') - if str2bool(self.procs[jobid].config['execution']['local_hash_check']): logger.debug('checking hash locally') try: From 716f92336add029661abc29b8eb27ce0cc494930 Mon Sep 17 00:00:00 2001 From: Cameron Craddock Date: Wed, 2 Dec 2015 03:12:07 +0000 Subject: [PATCH 26/45] Fixed logging of real memory --- nipype/interfaces/base.py | 38 +++++++++++++++++++++++-- nipype/interfaces/utility.py | 7 ++++- nipype/pipeline/plugins/base.py | 20 +++++++++---- nipype/pipeline/plugins/callback_log.py | 8 ++++-- nipype/pipeline/plugins/multiproc.py | 10 ++----- 5 files changed, 65 insertions(+), 18 deletions(-) diff --git a/nipype/interfaces/base.py b/nipype/interfaces/base.py index a0b1110098..2112cdc739 100644 --- a/nipype/interfaces/base.py +++ b/nipype/interfaces/base.py @@ -1206,9 +1206,18 @@ def run_command(runtime, output=None, timeout=0.01, redirect_x=False): The returned runtime contains a merged stdout+stderr log with timestamps """ - PIPE = subprocess.PIPE + # Import packages + try: + from memory_profiler import _get_memory + mem_prof = True + except: + mem_prof = False + + # Init variables + PIPE = subprocess.PIPE cmdline = runtime.cmdline + if redirect_x: exist_xvfb, _ = _exists_in_path('xvfb-run', runtime.environ) if not exist_xvfb: @@ -1237,6 +1246,11 @@ def run_command(runtime, output=None, timeout=0.01, redirect_x=False): result = {} errfile = os.path.join(runtime.cwd, 'stderr.nipype') outfile = os.path.join(runtime.cwd, 'stdout.nipype') + + # Init variables for memory profiling + ret = -1 + interval = 0.1 + if output == 'stream': streams = [Stream('stdout', proc.stdout), Stream('stderr', proc.stderr)] @@ -1252,8 +1266,10 @@ def _process(drain=0): else: for stream in res[0]: stream.read(drain) - while proc.returncode is None: + if mem_prof: + ret = max([ret, _get_memory(proc.pid, include_children=True)]) + time.sleep(interval) proc.poll() _process() _process(drain=1) @@ -1267,12 +1283,23 @@ def _process(drain=0): result[stream._name] = [r[2] for r in rows] temp.sort() result['merged'] = [r[1] for r in temp] + if output == 'allatonce': + if mem_prof: + while proc.returncode is None: + ret = max([ret, _get_memory(proc.pid, include_children=True)]) + time.sleep(interval) + proc.poll() stdout, stderr = proc.communicate() result['stdout'] = stdout.split('\n') result['stderr'] = stderr.split('\n') result['merged'] = '' if output == 'file': + if mem_prof: + while proc.returncode is None: + ret = max([ret, _get_memory(proc.pid, include_children=True)]) + time.sleep(interval) + proc.poll() ret_code = proc.wait() stderr.flush() stdout.flush() @@ -1280,10 +1307,17 @@ def _process(drain=0): result['stderr'] = [line.strip() for line in open(errfile).readlines()] 
result['merged'] = '' if output == 'none': + if mem_prof: + while proc.returncode is None: + ret = max([ret, _get_memory(proc.pid, include_children=True)]) + time.sleep(interval) + proc.poll() proc.communicate() result['stdout'] = [] result['stderr'] = [] result['merged'] = '' + + setattr(runtime, 'real_memory2', ret/1024.0) runtime.stderr = '\n'.join(result['stderr']) runtime.stdout = '\n'.join(result['stdout']) runtime.merged = result['merged'] diff --git a/nipype/interfaces/utility.py b/nipype/interfaces/utility.py index ca2bb5ba69..10effaa548 100644 --- a/nipype/interfaces/utility.py +++ b/nipype/interfaces/utility.py @@ -442,7 +442,12 @@ def _run_interface(self, runtime): if isdefined(value): args[name] = value - out = function_handle(**args) + # mem stuff + import memory_profiler + proc = (function_handle, (), args) + mem_mb, out = memory_profiler.memory_usage(proc=proc, retval=True, include_children=True, max_usage=True) + setattr(runtime, 'real_memory2', mem_mb[0]/1024.0) + #out = function_handle(**args) if len(self._output_names) == 1: self._out[self._output_names[0]] = out diff --git a/nipype/pipeline/plugins/base.py b/nipype/pipeline/plugins/base.py index bda811354d..2299bf4b23 100644 --- a/nipype/pipeline/plugins/base.py +++ b/nipype/pipeline/plugins/base.py @@ -241,7 +241,8 @@ def run(self, graph, config, updatehash=False): notrun.append(self._clean_queue(jobid, graph, result=result)) else: - self._task_finished_cb(jobid) + print "DJC: Calling task finished for %s cb from DistributedPluginBase.run"%(str(taskid)) + self._task_finished_cb(jobid, result) self._remove_node_dirs() self._clear_task(taskid) else: @@ -379,6 +380,7 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): ) ): continue_with_submission = False + print "DJC: Calling task finised cb from DistributedPluginBase._send_procs_to_workers hash==true" self._task_finished_cb(jobid) self._remove_node_dirs() except Exception: @@ -395,6 +397,7 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): self.procs[jobid].run() except Exception: self._clean_queue(jobid, graph) + print "DJC: Calling task finised cb from DistributedPluginBase._send_procs_to_workers continue_with_submission==true" self._task_finished_cb(jobid) self._remove_node_dirs() else: @@ -408,7 +411,7 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): else: break - def _task_finished_cb(self, jobid): + def _task_finished_cb(self, jobid, result=None): """ Extract outputs and assign to inputs of dependent tasks This is called when a job is completed. @@ -416,10 +419,15 @@ def _task_finished_cb(self, jobid): logger.info('[Job finished] jobname: %s jobid: %d' % (self.procs[jobid]._id, jobid)) if self._status_callback: - print '!!!!!!!!!!!!!!!!!!!' 
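The polling loops added to run_command in this patch rely on memory_profiler's private _get_memory helper; a rough equivalent using only psutil (an assumption for illustration, not the code in the patch) would be:

import time
import psutil

def peak_rss_while_running_sketch(proc, interval=0.1):
    # Poll a subprocess.Popen object until it exits, tracking the peak resident
    # memory (parent plus children) in MB
    peak = 0.0
    parent = psutil.Process(proc.pid)
    while proc.poll() is None:
        try:
            rss = parent.memory_info().rss
            for child in parent.children(recursive=True):
                rss += child.memory_info().rss
        except psutil.NoSuchProcess:
            break
        peak = max(peak, rss / (1024.0 * 1024.0))
        time.sleep(interval)
    return peak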
- print self._taskresult - print self._taskresult.keys() - self._status_callback(self.procs[jobid], 'end', self._taskresult[self.taskresultid]) + if result == None: + if self._taskresult.has_key(jobid): + result = self._taskresult[jobid].get() + print 'MMMM' + print result['real_memory'], result['real_memory2'] + else: + print "DJC: %s not found, taskresult keys are: %s"%(str(jobid),":".join([str(k) for k in self._taskresult.keys()])) + result = {'real_memory' : 'nokey'} + self._status_callback(self.procs[jobid], 'end', result) # Update job and worker queues self.proc_pending[jobid] = False # update the job dependency structure diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py index 6fad0eee44..d6795048df 100644 --- a/nipype/pipeline/plugins/callback_log.py +++ b/nipype/pipeline/plugins/callback_log.py @@ -2,8 +2,12 @@ import logging def log_nodes_cb(node, status, result=None): - print 'status', status logger = logging.getLogger('callback') + try: + real_mem1 = result['real_memory'] + real_mem2 = result['result'].runtime.get('real_memory2') + except Exception as exc: + real_mem1 = real_mem2 = 'N/A' if status == 'start': message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' +\ node._id + '"' + ',"start":' + '"' +str(datetime.datetime.now()) +\ @@ -16,7 +20,7 @@ def log_nodes_cb(node, status, result=None): message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ '"' + ',"memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ - + str(node._interface.num_threads) + ',"real memory":' + str(result['real_memory']) + '}' + + str(node._interface.num_threads) + ',"real memory1":' + str(real_mem1) + ',"real memory2":' + str(real_mem2) + '}' logger.debug(message) diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index 5091d25b27..c4b6be1af9 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -22,12 +22,10 @@ def run_node(node, updatehash, plugin_args=None): if run_memory: import memory_profiler proc = (node.run, (), {'updatehash' : updatehash}) - mem_mb, retval = memory_profiler.memory_usage(proc, max_usage=True, retval=True) + mem_mb, retval = memory_profiler.memory_usage(proc=proc, retval=True, include_children=True, max_usage=True) result['result'] = retval - result['real_memory'] = 100 - print 'Just populated task result!!!!!!!!!!!!!!!!!!!' - print result - #node._interface.real_memory = mem_mb[0]/1024.0 + result['real_memory'] = mem_mb[0]/1024.0 + result['real_memory2'] = retval.runtime.get('real_memory2') else: try: result['result'] = node.run(updatehash=updatehash) @@ -177,8 +175,6 @@ def _submit_job(self, node, updatehash=False): self._taskresult[self._taskid] = self.pool.apply_async(run_node, (node, updatehash, self.plugin_args), callback=release_lock) - print 'Printing on output!!!!!!!!!!' 
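A side note on the log format: log_nodes_cb assembles its JSON records by string concatenation, which is easy to break when values need quoting; an equivalent formulation with the json module (an alternative sketch, not the patch's implementation) would be:

import json
import datetime

def format_node_event_sketch(node, status):
    # Build one JSON record per node event; keys roughly mirror those emitted
    # by log_nodes_cb in this series
    event = {'name': node.name,
             'id': node._id,
             'memory': getattr(node._interface, 'estimated_memory', None),
             'num_threads': getattr(node._interface, 'num_threads', None)}
    event['start' if status == 'start' else 'finish'] = str(datetime.datetime.now())
    if status not in ('start', 'end'):
        event['error'] = 'True'
    return json.dumps(event)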
- print self._taskresult, self._taskid return self._taskid def _send_procs_to_workers(self, updatehash=False, graph=None): From ff7959ac4af9766da324635bca44f5b9aa21f227 Mon Sep 17 00:00:00 2001 From: Cameron Craddock Date: Wed, 2 Dec 2015 03:51:37 +0000 Subject: [PATCH 27/45] Added per node runtime logging --- nipype/pipeline/plugins/callback_log.py | 7 ++++--- nipype/pipeline/plugins/multiproc.py | 4 ++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py index d6795048df..a20242df95 100644 --- a/nipype/pipeline/plugins/callback_log.py +++ b/nipype/pipeline/plugins/callback_log.py @@ -5,9 +5,10 @@ def log_nodes_cb(node, status, result=None): logger = logging.getLogger('callback') try: real_mem1 = result['real_memory'] - real_mem2 = result['result'].runtime.get('real_memory2') + real_mem2 = result['real_memory2'] + run_seconds = result['run_seconds'] except Exception as exc: - real_mem1 = real_mem2 = 'N/A' + real_mem1 = real_mem2 = run_seconds = 'N/A' if status == 'start': message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' +\ node._id + '"' + ',"start":' + '"' +str(datetime.datetime.now()) +\ @@ -20,7 +21,7 @@ def log_nodes_cb(node, status, result=None): message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ '"' + ',"memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ - + str(node._interface.num_threads) + ',"real memory1":' + str(real_mem1) + ',"real memory2":' + str(real_mem2) + '}' + + str(node._interface.num_threads) + ',"real_memory1":' + str(real_mem1) + ',"real_memory2":' + str(real_mem2) + ',"run_seconds":' + str(run_seconds) + '}' logger.debug(message) diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index c4b6be1af9..3a5c63df35 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -21,11 +21,15 @@ def run_node(node, updatehash, plugin_args=None): run_memory = False if run_memory: import memory_profiler + import datetime proc = (node.run, (), {'updatehash' : updatehash}) + start = datetime.datetime.now() mem_mb, retval = memory_profiler.memory_usage(proc=proc, retval=True, include_children=True, max_usage=True) + runtime = (datetime.datetime.now() - start).total_seconds() result['result'] = retval result['real_memory'] = mem_mb[0]/1024.0 result['real_memory2'] = retval.runtime.get('real_memory2') + result['run_seconds'] = runtime else: try: result['result'] = node.run(updatehash=updatehash) From d25afb5b4a6668dcb6ddf06eda9a5f6702dbd598 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Thu, 10 Dec 2015 10:10:52 -0500 Subject: [PATCH 28/45] Removed debugging print statements --- nipype/pipeline/plugins/base.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/nipype/pipeline/plugins/base.py b/nipype/pipeline/plugins/base.py index 2299bf4b23..ab76520844 100644 --- a/nipype/pipeline/plugins/base.py +++ b/nipype/pipeline/plugins/base.py @@ -241,7 +241,6 @@ def run(self, graph, config, updatehash=False): notrun.append(self._clean_queue(jobid, graph, result=result)) else: - print "DJC: Calling task finished for %s cb from DistributedPluginBase.run"%(str(taskid)) self._task_finished_cb(jobid, result) self._remove_node_dirs() self._clear_task(taskid) @@ -380,7 +379,6 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): ) ): continue_with_submission = False - print "DJC: Calling task finised cb 
from DistributedPluginBase._send_procs_to_workers hash==true" self._task_finished_cb(jobid) self._remove_node_dirs() except Exception: @@ -397,7 +395,6 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): self.procs[jobid].run() except Exception: self._clean_queue(jobid, graph) - print "DJC: Calling task finised cb from DistributedPluginBase._send_procs_to_workers continue_with_submission==true" self._task_finished_cb(jobid) self._remove_node_dirs() else: @@ -422,10 +419,7 @@ def _task_finished_cb(self, jobid, result=None): if result == None: if self._taskresult.has_key(jobid): result = self._taskresult[jobid].get() - print 'MMMM' - print result['real_memory'], result['real_memory2'] else: - print "DJC: %s not found, taskresult keys are: %s"%(str(jobid),":".join([str(k) for k in self._taskresult.keys()])) result = {'real_memory' : 'nokey'} self._status_callback(self.procs[jobid], 'end', result) # Update job and worker queues From 00a470bc845c6410bd885e7d77326ab42983c995 Mon Sep 17 00:00:00 2001 From: carolFrohlich Date: Wed, 30 Dec 2015 13:05:57 -0500 Subject: [PATCH 29/45] sync with master --- nipype/interfaces/fsl/model.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/nipype/interfaces/fsl/model.py b/nipype/interfaces/fsl/model.py index 7cebdbb629..369ecb28f3 100644 --- a/nipype/interfaces/fsl/model.py +++ b/nipype/interfaces/fsl/model.py @@ -250,13 +250,22 @@ def _create_ev_files( element=count, ctype=ctype, val=val) ev_txt += "\n" - if con[0] in con_map.keys(): - for fconidx in con_map[con[0]]: - ev_txt += contrast_ftest_element.substitute( - cnum=ftest_idx.index(fconidx) + 1, - element=tidx, - ctype=ctype, - val=1) + # if con[0] in con_map.keys(): + # for fconidx in con_map[con[0]]: + # ev_txt += contrast_ftest_element.substitute( + # cnum=ftest_idx.index(fconidx) + 1, + # element=tidx, + # ctype=ctype, + # val=1) + for fconidx in ftest_idx: + fval=0 + if con[0] in con_map.keys() and fconidx in con_map[con[0]]: + fval=1 + ev_txt += contrast_ftest_element.substitute( + cnum=ftest_idx.index(fconidx) + 1, + element=tidx, + ctype=ctype, + val=fval) ev_txt += "\n" # add contrast mask info From 89d7e9c418b6fa9e684508127fb07b00bce4dc30 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Thu, 7 Jan 2016 18:27:01 -0500 Subject: [PATCH 30/45] Added fakes3 integration with datasink and started adding a local_copy flag to the output generation logic --- nipype/interfaces/io.py | 98 +++++++++++++++++---------- nipype/interfaces/tests/test_io.py | 104 +++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 36 deletions(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index f8b4f16f60..f944114b8d 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -205,6 +205,11 @@ class DataSinkInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec): 'access') encrypt_bucket_keys = traits.Bool(desc='Flag indicating whether to use S3 '\ 'server-side AES-256 encryption') + # Set this if user wishes to override the bucket with their own + bucket = traits.Generic(mandatory=False, + desc='Boto3 S3 bucket for manual override of bucket') + # Set this if user wishes to have local copy of files as well + local_dir = traits.Str(desc='Copy files locally as well as to S3 bucket') # Set call-able inputs attributes def __setattr__(self, key, value): @@ -385,7 +390,6 @@ def _check_s3_base_dir(self): # Init variables s3_str = 's3://' - sep = os.path.sep base_directory = self.inputs.base_directory # Explicitly lower-case the "s3" @@ 
-396,11 +400,16 @@ def _check_s3_base_dir(self): # Check if 's3://' in base dir if base_directory.startswith(s3_str): + # Attempt to access bucket try: # Expects bucket name to be 's3://bucket_name/base_dir/..' - bucket_name = base_directory.split(s3_str)[1].split(sep)[0] + bucket_name = base_directory.split(s3_str)[1].split('/')[0] # Get the actual bucket object - self.bucket = self._fetch_bucket(bucket_name) + if self.inputs.bucket: + self.bucket = self.inputs.bucket + else: + self.bucket = self._fetch_bucket(bucket_name) + # Report error in case of exception except Exception as exc: err_msg = 'Unable to access S3 bucket. Error:\n%s. Exiting...'\ % exc @@ -566,7 +575,7 @@ def _upload_to_s3(self, src, dst): bucket = self.bucket iflogger = logging.getLogger('interface') s3_str = 's3://' - s3_prefix = os.path.join(s3_str, bucket.name) + s3_prefix = s3_str + bucket.name # Explicitly lower-case the "s3" if dst.lower().startswith(s3_str): @@ -629,41 +638,53 @@ def _list_outputs(self): iflogger = logging.getLogger('interface') outputs = self.output_spec().get() out_files = [] - outdir = self.inputs.base_directory + # Use hardlink use_hardlink = str2bool(config.get('execution', 'try_hard_link_datasink')) - # If base directory isn't given, assume current directory - if not isdefined(outdir): - outdir = '.' + # Set local output directory if specified + if isdefined(self.inputs.local_copy): + outdir = self.inputs.local_copy + else: + outdir = self.inputs.base_directory + # If base directory isn't given, assume current directory + if not isdefined(outdir): + outdir = '.' - # Check if base directory reflects S3-bucket upload + # Check if base directory reflects S3 bucket upload try: s3_flag = self._check_s3_base_dir() + s3dir = self.inputs.base_directory + if isdefined(self.inputs.container): + s3dir = os.path.join(s3dir, self.inputs.container) # If encountering an exception during bucket access, set output # base directory to a local folder except Exception as exc: - local_out_exception = os.path.join(os.path.expanduser('~'), - 'data_output') + if not isdefined(self.inputs.local_copy): + local_out_exception = os.path.join(os.path.expanduser('~'), + 's3_datasink_' + self.bucket.name) + outdir = local_out_exception + else: + outdir = self.inputs.local_copy + # Log local copying directory iflogger.info('Access to S3 failed! 
Storing outputs locally at: '\ - '%s\nError: %s' %(local_out_exception, exc)) - self.inputs.base_directory = local_out_exception - - # If not accessing S3, just set outdir to local absolute path - if not s3_flag: - outdir = os.path.abspath(outdir) + '%s\nError: %s' %(outdir, exc)) # If container input is given, append that to outdir if isdefined(self.inputs.container): outdir = os.path.join(outdir, self.inputs.container) - # Create the directory if it doesn't exist - if not os.path.exists(outdir): - try: - os.makedirs(outdir) - except OSError, inst: - if 'File exists' in inst: - pass - else: - raise(inst) + + # If doing a localy output + if not outdir.lower().startswith('s3://'): + outdir = os.path.abspath(outdir) + # Create the directory if it doesn't exist + if not os.path.exists(outdir): + try: + os.makedirs(outdir) + except OSError, inst: + if 'File exists' in inst: + pass + else: + raise(inst) # Iterate through outputs attributes {key : path(s)} for key, files in self.inputs._outputs.items(): @@ -672,10 +693,14 @@ def _list_outputs(self): iflogger.debug("key: %s files: %s" % (key, str(files))) files = filename_to_list(files) tempoutdir = outdir + if s3_flag: + s3tempoutdir = s3dir for d in key.split('.'): if d[0] == '@': continue tempoutdir = os.path.join(tempoutdir, d) + if s3_flag: + s3tempoutdir = os.path.join(s3tempoutdir, d) # flattening list if isinstance(files, list): @@ -690,25 +715,26 @@ def _list_outputs(self): src = os.path.join(src, '') dst = self._get_dst(src) dst = os.path.join(tempoutdir, dst) + s3dst = os.path.join(s3tempoutdir, dst) dst = self._substitute(dst) path, _ = os.path.split(dst) - # Create output directory if it doesnt exist - if not os.path.exists(path): - try: - os.makedirs(path) - except OSError, inst: - if 'File exists' in inst: - pass - else: - raise(inst) - # If we're uploading to S3 if s3_flag: + dst = dst.replace(outdir, self.inputs.base_directory) self._upload_to_s3(src, dst) out_files.append(dst) # Otherwise, copy locally src -> dst else: + # Create output directory if it doesnt exist + if not os.path.exists(path): + try: + os.makedirs(path) + except OSError, inst: + if 'File exists' in inst: + pass + else: + raise(inst) # If src is a file, copy it to dst if os.path.isfile(src): iflogger.debug('copyfile: %s %s' % (src, dst)) diff --git a/nipype/interfaces/tests/test_io.py b/nipype/interfaces/tests/test_io.py index 8c19ea0503..25ac843d0e 100644 --- a/nipype/interfaces/tests/test_io.py +++ b/nipype/interfaces/tests/test_io.py @@ -13,6 +13,7 @@ import nipype.interfaces.io as nio from nipype.interfaces.base import Undefined +# Check for boto noboto = False try: import boto @@ -20,6 +21,13 @@ except: noboto = True +# Check for boto3 +noboto3 = False +try: + import boto3 + from botocore.utils import fix_s3_host +except: + noboto3 = True def test_datagrabber(): dg = nio.DataGrabber() @@ -155,6 +163,102 @@ def test_datasink(): ds = nio.DataSink(infields=['test']) yield assert_true, 'test' in ds.inputs.copyable_trait_names() +# Function to check for fakes3 +def _check_for_fakes3(): + ''' + Function used internally to check for fakes3 installation + ''' + + # Import packages + import subprocess + + # Init variables + fakes3_found = False + + # Check for fakes3 + try: + ret_code = subprocess.check_call(['which', 'fakes3']) + if ret_code == 0: + fakes3_found = True + except subprocess.CalledProcessError as exc: + print 'fakes3 not found, install via \'gem install fakes3\', skipping test...' 
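For context, the DataSink inputs added in this patch (an 's3://' base_directory, an optional boto3 bucket override, and a local copy directory, named local_copy from the follow-up patch onward) end up being driven much like the S3 test added below. A minimal usage sketch; the bucket, folder and file names here are placeholders, not values from the patch:

    import nipype.interfaces.io as nio

    ds = nio.DataSink()
    ds.inputs.base_directory = 's3://mybucket/derivatives'   # placeholder bucket/prefix
    ds.inputs.container = 'sub001'                           # placeholder container folder
    ds.inputs.local_copy = '/tmp/derivatives'                # also keep a local copy of the outputs
    ds.inputs.anat = '/path/to/T1.nii.gz'                    # placeholder output key and source file
    ds.run()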
+ except: + print 'Unable to check for fakes3 installation, skipping test...' + + # Return if found + return fakes3_found + +@skipif(noboto3) +# Test datasink writes to s3 properly +def test_datasink_to_s3(): + ''' + This function tests to see if the S3 functionality of a DataSink + works properly + ''' + + # Import packages + import hashlib + import tempfile + + # Init variables + ds = nio.DataSink() + bucket_name = 'test' + container = 'outputs' + attr_folder = 'text_file' + output_dir = 's3://' + bucket_name + # Local temporary filepaths for testing + fakes3_dir = tempfile.mkdtemp() + input_dir = tempfile.mkdtemp() + input_path = os.path.join(input_dir, 'datasink_test_s3.txt') + + # Check for fakes3 + fakes3_found = _check_for_fakes3() + if not fakes3_found: + return + + # Start up fake-S3 server + proc = Popen(['fakes3', '-r', fakes3_dir, '-p', '4567'], stdout=open(os.devnull, 'wb')) + + # Init boto3 s3 resource to talk with fakes3 + resource = boto3.resource(aws_access_key_id='mykey', + aws_secret_access_key='mysecret', + service_name='s3', + endpoint_url='http://localhost:4567', + use_ssl=False) + resource.meta.client.meta.events.unregister('before-sign.s3', fix_s3_host) + + # Create bucket + bucket = resource.create_bucket(Bucket=bucket_name) + + # Create input file + with open(input_path, 'wb') as f: + f.write('ABCD1234') + + # Prep datasink + ds.inputs.base_directory = output_dir + ds.inputs.container = container + ds.inputs.bucket = bucket + setattr(ds.inputs, attr_folder, input_path) + + # Run datasink + ds.run() + + # Get MD5sums and compare + key = '/'.join([container, attr_folder, os.path.basename(input_path)]) + obj = bucket.Object(key=key) + dst_md5 = obj.e_tag.replace('"', '') + src_md5 = hashlib.md5(open(input_path, 'rb').read()).hexdigest() + + # Make sure md5sums match + yield assert_equal, src_md5, dst_md5 + + # Kill fakes3 + proc.kill() + + # Delete fakes3 folder and input file + shutil.rmtree(fakes3_dir) + shutil.rmtree(input_dir) + @skipif(noboto) def test_s3datasink(): ds = nio.S3DataSink() From a70c81e9d01a8717a39ef1d171e60943b10f5db5 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Fri, 8 Jan 2016 14:32:33 -0500 Subject: [PATCH 31/45] Finished adding local_copy logic and passed all unit tests --- nipype/interfaces/io.py | 38 +++++++----- nipype/interfaces/tests/test_io.py | 92 ++++++++++++++++++++++++------ 2 files changed, 98 insertions(+), 32 deletions(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index f944114b8d..86359756f6 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -209,7 +209,7 @@ class DataSinkInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec): bucket = traits.Generic(mandatory=False, desc='Boto3 S3 bucket for manual override of bucket') # Set this if user wishes to have local copy of files as well - local_dir = traits.Str(desc='Copy files locally as well as to S3 bucket') + local_copy = traits.Str(desc='Copy files locally as well as to S3 bucket') # Set call-able inputs attributes def __setattr__(self, key, value): @@ -392,6 +392,10 @@ def _check_s3_base_dir(self): s3_str = 's3://' base_directory = self.inputs.base_directory + if not isdefined(base_directory): + s3_flag = False + return s3_flag + # Explicitly lower-case the "s3" if base_directory.lower().startswith(s3_str): base_dir_sp = base_directory.split('/') @@ -616,7 +620,7 @@ def _upload_to_s3(self, src, dst): else: iflogger.info('Overwriting previous S3 file...') - except ClientError as exc: + except ClientError: iflogger.info('New file to S3') # Copy 
file up to S3 (either encrypted or not) @@ -653,18 +657,21 @@ def _list_outputs(self): # Check if base directory reflects S3 bucket upload try: s3_flag = self._check_s3_base_dir() - s3dir = self.inputs.base_directory - if isdefined(self.inputs.container): - s3dir = os.path.join(s3dir, self.inputs.container) + if s3_flag: + s3dir = self.inputs.base_directory + if isdefined(self.inputs.container): + s3dir = os.path.join(s3dir, self.inputs.container) + else: + s3dir = '' # If encountering an exception during bucket access, set output # base directory to a local folder except Exception as exc: + s3dir = '' + s3_flag = False if not isdefined(self.inputs.local_copy): local_out_exception = os.path.join(os.path.expanduser('~'), 's3_datasink_' + self.bucket.name) outdir = local_out_exception - else: - outdir = self.inputs.local_copy # Log local copying directory iflogger.info('Access to S3 failed! Storing outputs locally at: '\ '%s\nError: %s' %(outdir, exc)) @@ -673,8 +680,8 @@ def _list_outputs(self): if isdefined(self.inputs.container): outdir = os.path.join(outdir, self.inputs.container) - # If doing a localy output - if not outdir.lower().startswith('s3://'): + # If sinking to local folder + if outdir != s3dir: outdir = os.path.abspath(outdir) # Create the directory if it doesn't exist if not os.path.exists(outdir): @@ -714,18 +721,19 @@ def _list_outputs(self): if not os.path.isfile(src): src = os.path.join(src, '') dst = self._get_dst(src) + if s3_flag: + s3dst = os.path.join(s3tempoutdir, dst) + s3dst = self._substitute(s3dst) dst = os.path.join(tempoutdir, dst) - s3dst = os.path.join(s3tempoutdir, dst) dst = self._substitute(dst) path, _ = os.path.split(dst) # If we're uploading to S3 if s3_flag: - dst = dst.replace(outdir, self.inputs.base_directory) - self._upload_to_s3(src, dst) - out_files.append(dst) + self._upload_to_s3(src, s3dst) + out_files.append(s3dst) # Otherwise, copy locally src -> dst - else: + if not s3_flag or isdefined(self.inputs.local_copy): # Create output directory if it doesnt exist if not os.path.exists(path): try: @@ -787,6 +795,8 @@ class S3DataSinkInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec): _outputs = traits.Dict(traits.Str, value={}, usedefault=True) remove_dest_dir = traits.Bool(False, usedefault=True, desc='remove dest directory when copying dirs') + # Set this if user wishes to have local copy of files as well + local_copy = traits.Str(desc='Copy files locally as well as to S3 bucket') def __setattr__(self, key, value): if key not in self.copyable_trait_names(): diff --git a/nipype/interfaces/tests/test_io.py b/nipype/interfaces/tests/test_io.py index 25ac843d0e..d5abeab223 100644 --- a/nipype/interfaces/tests/test_io.py +++ b/nipype/interfaces/tests/test_io.py @@ -177,7 +177,7 @@ def _check_for_fakes3(): # Check for fakes3 try: - ret_code = subprocess.check_call(['which', 'fakes3']) + ret_code = subprocess.check_call(['which', 'fakes3'], stdout=open(os.devnull, 'wb')) if ret_code == 0: fakes3_found = True except subprocess.CalledProcessError as exc: @@ -188,7 +188,29 @@ def _check_for_fakes3(): # Return if found return fakes3_found -@skipif(noboto3) +def _make_dummy_input(): + ''' + ''' + + # Import packages + import tempfile + + # Init variables + input_dir = tempfile.mkdtemp() + input_path = os.path.join(input_dir, 'datasink_test_s3.txt') + + # Create input file + with open(input_path, 'wb') as f: + f.write('ABCD1234') + + # Return path + return input_path + +# Check for fakes3 +fakes3 = _check_for_fakes3() + + +@skipif(noboto3 or not fakes3) # 
Test datasink writes to s3 properly def test_datasink_to_s3(): ''' @@ -208,13 +230,7 @@ def test_datasink_to_s3(): output_dir = 's3://' + bucket_name # Local temporary filepaths for testing fakes3_dir = tempfile.mkdtemp() - input_dir = tempfile.mkdtemp() - input_path = os.path.join(input_dir, 'datasink_test_s3.txt') - - # Check for fakes3 - fakes3_found = _check_for_fakes3() - if not fakes3_found: - return + input_path = _make_dummy_input() # Start up fake-S3 server proc = Popen(['fakes3', '-r', fakes3_dir, '-p', '4567'], stdout=open(os.devnull, 'wb')) @@ -230,10 +246,6 @@ def test_datasink_to_s3(): # Create bucket bucket = resource.create_bucket(Bucket=bucket_name) - # Create input file - with open(input_path, 'wb') as f: - f.write('ABCD1234') - # Prep datasink ds.inputs.base_directory = output_dir ds.inputs.container = container @@ -249,15 +261,59 @@ def test_datasink_to_s3(): dst_md5 = obj.e_tag.replace('"', '') src_md5 = hashlib.md5(open(input_path, 'rb').read()).hexdigest() - # Make sure md5sums match - yield assert_equal, src_md5, dst_md5 - # Kill fakes3 proc.kill() # Delete fakes3 folder and input file shutil.rmtree(fakes3_dir) - shutil.rmtree(input_dir) + shutil.rmtree(os.path.dirname(input_path)) + + # Make sure md5sums match + yield assert_equal, src_md5, dst_md5 + +# Test the local copy attribute +def test_datasink_localcopy(): + ''' + Function to validate DataSink will make local copy via local_copy + attribute + ''' + + # Import packages + import hashlib + import tempfile + + # Init variables + local_dir = tempfile.mkdtemp() + container = 'outputs' + attr_folder = 'text_file' + + # Make dummy input file and datasink + input_path = _make_dummy_input() + ds = nio.DataSink() + + # Set up datasink + ds.inputs.container = container + ds.inputs.local_copy = local_dir + setattr(ds.inputs, attr_folder, input_path) + + # Expected local copy path + local_copy = os.path.join(local_dir, container, attr_folder, + os.path.basename(input_path)) + + # Run the datasink + ds.run() + + # Check md5sums of both + src_md5 = hashlib.md5(open(input_path, 'rb').read()).hexdigest() + dst_md5 = hashlib.md5(open(local_copy, 'rb').read()).hexdigest() + + # Delete temp diretories + shutil.rmtree(os.path.dirname(input_path)) + shutil.rmtree(local_dir) + + # Perform test + yield assert_equal, src_md5, dst_md5 + @skipif(noboto) def test_s3datasink(): @@ -300,7 +356,7 @@ def test_datasink_substitutions(): shutil.rmtree(indir) shutil.rmtree(outdir) -@skipif(noboto) +@skipif(noboto or not fakes3) def test_s3datasink_substitutions(): indir = mkdtemp(prefix='-Tmp-nipype_ds_subs_in') outdir = mkdtemp(prefix='-Tmp-nipype_ds_subs_out') From 2af5c1d4de916f6cd7fd143b7eadf3796489891e Mon Sep 17 00:00:00 2001 From: dclark87 Date: Fri, 8 Jan 2016 14:36:43 -0500 Subject: [PATCH 32/45] Removed memory profiler stuff for now --- nipype/interfaces/base.py | 7 ------- nipype/interfaces/utility.py | 7 +------ nipype/pipeline/plugins/multiproc.py | 26 +++++--------------------- 3 files changed, 6 insertions(+), 34 deletions(-) diff --git a/nipype/interfaces/base.py b/nipype/interfaces/base.py index 2112cdc739..414f36932c 100644 --- a/nipype/interfaces/base.py +++ b/nipype/interfaces/base.py @@ -1207,13 +1207,6 @@ def run_command(runtime, output=None, timeout=0.01, redirect_x=False): The returned runtime contains a merged stdout+stderr log with timestamps """ - # Import packages - try: - from memory_profiler import _get_memory - mem_prof = True - except: - mem_prof = False - # Init variables PIPE = subprocess.PIPE cmdline = 
runtime.cmdline diff --git a/nipype/interfaces/utility.py b/nipype/interfaces/utility.py index 10effaa548..ca2bb5ba69 100644 --- a/nipype/interfaces/utility.py +++ b/nipype/interfaces/utility.py @@ -442,12 +442,7 @@ def _run_interface(self, runtime): if isdefined(value): args[name] = value - # mem stuff - import memory_profiler - proc = (function_handle, (), args) - mem_mb, out = memory_profiler.memory_usage(proc=proc, retval=True, include_children=True, max_usage=True) - setattr(runtime, 'real_memory2', mem_mb[0]/1024.0) - #out = function_handle(**args) + out = function_handle(**args) if len(self._output_names) == 1: self._out[self._output_names[0]] = out diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index 3a5c63df35..1bca2d1922 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -16,27 +16,11 @@ def run_node(node, updatehash, plugin_args=None): result = dict(result=None, traceback=None) try: - run_memory = plugin_args['memory_profile'] - except Exception: - run_memory = False - if run_memory: - import memory_profiler - import datetime - proc = (node.run, (), {'updatehash' : updatehash}) - start = datetime.datetime.now() - mem_mb, retval = memory_profiler.memory_usage(proc=proc, retval=True, include_children=True, max_usage=True) - runtime = (datetime.datetime.now() - start).total_seconds() - result['result'] = retval - result['real_memory'] = mem_mb[0]/1024.0 - result['real_memory2'] = retval.runtime.get('real_memory2') - result['run_seconds'] = runtime - else: - try: - result['result'] = node.run(updatehash=updatehash) - except: - etype, eval, etr = sys.exc_info() - result['traceback'] = format_exception(etype,eval,etr) - result['result'] = node.result + result['result'] = node.run(updatehash=updatehash) + except: + etype, eval, etr = sys.exc_info() + result['traceback'] = format_exception(etype,eval,etr) + result['result'] = node.result return result From b7e930937041bbee47b096f70ae4093c412a28be Mon Sep 17 00:00:00 2001 From: dclark87 Date: Fri, 8 Jan 2016 15:33:43 -0500 Subject: [PATCH 33/45] Removed the memory profiler code to just pull in s3 datasink code --- nipype/interfaces/base.py | 31 ------------------------- nipype/pipeline/plugins/base.py | 11 +++------ nipype/pipeline/plugins/callback_log.py | 14 ++++------- nipype/pipeline/plugins/multiproc.py | 6 ++--- 4 files changed, 10 insertions(+), 52 deletions(-) diff --git a/nipype/interfaces/base.py b/nipype/interfaces/base.py index 414f36932c..b202453c4f 100644 --- a/nipype/interfaces/base.py +++ b/nipype/interfaces/base.py @@ -751,17 +751,8 @@ def __init__(self, **inputs): self.__class__.__name__) self.inputs = self.input_spec(**inputs) self.estimated_memory = 1 - self._real_memory = 0 self.num_threads = 1 - @property - def real_memory(self): - return self._real_memory - - @real_memory.setter - def real_memory(self, value): - self._real_memory = value - @classmethod def help(cls, returnhelp=False): """ Prints class help @@ -1240,9 +1231,6 @@ def run_command(runtime, output=None, timeout=0.01, redirect_x=False): errfile = os.path.join(runtime.cwd, 'stderr.nipype') outfile = os.path.join(runtime.cwd, 'stdout.nipype') - # Init variables for memory profiling - ret = -1 - interval = 0.1 if output == 'stream': streams = [Stream('stdout', proc.stdout), Stream('stderr', proc.stderr)] @@ -1260,9 +1248,6 @@ def _process(drain=0): for stream in res[0]: stream.read(drain) while proc.returncode is None: - if mem_prof: - ret = max([ret, 
_get_memory(proc.pid, include_children=True)]) - time.sleep(interval) proc.poll() _process() _process(drain=1) @@ -1278,21 +1263,11 @@ def _process(drain=0): result['merged'] = [r[1] for r in temp] if output == 'allatonce': - if mem_prof: - while proc.returncode is None: - ret = max([ret, _get_memory(proc.pid, include_children=True)]) - time.sleep(interval) - proc.poll() stdout, stderr = proc.communicate() result['stdout'] = stdout.split('\n') result['stderr'] = stderr.split('\n') result['merged'] = '' if output == 'file': - if mem_prof: - while proc.returncode is None: - ret = max([ret, _get_memory(proc.pid, include_children=True)]) - time.sleep(interval) - proc.poll() ret_code = proc.wait() stderr.flush() stdout.flush() @@ -1300,17 +1275,11 @@ def _process(drain=0): result['stderr'] = [line.strip() for line in open(errfile).readlines()] result['merged'] = '' if output == 'none': - if mem_prof: - while proc.returncode is None: - ret = max([ret, _get_memory(proc.pid, include_children=True)]) - time.sleep(interval) - proc.poll() proc.communicate() result['stdout'] = [] result['stderr'] = [] result['merged'] = '' - setattr(runtime, 'real_memory2', ret/1024.0) runtime.stderr = '\n'.join(result['stderr']) runtime.stdout = '\n'.join(result['stdout']) runtime.merged = result['merged'] diff --git a/nipype/pipeline/plugins/base.py b/nipype/pipeline/plugins/base.py index ab76520844..cee2c7dad5 100644 --- a/nipype/pipeline/plugins/base.py +++ b/nipype/pipeline/plugins/base.py @@ -241,7 +241,7 @@ def run(self, graph, config, updatehash=False): notrun.append(self._clean_queue(jobid, graph, result=result)) else: - self._task_finished_cb(jobid, result) + self._task_finished_cb(jobid) self._remove_node_dirs() self._clear_task(taskid) else: @@ -408,7 +408,7 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): else: break - def _task_finished_cb(self, jobid, result=None): + def _task_finished_cb(self, jobid): """ Extract outputs and assign to inputs of dependent tasks This is called when a job is completed. 
@@ -416,12 +416,7 @@ def _task_finished_cb(self, jobid, result=None): logger.info('[Job finished] jobname: %s jobid: %d' % (self.procs[jobid]._id, jobid)) if self._status_callback: - if result == None: - if self._taskresult.has_key(jobid): - result = self._taskresult[jobid].get() - else: - result = {'real_memory' : 'nokey'} - self._status_callback(self.procs[jobid], 'end', result) + self._status_callback(self.procs[jobid], 'end') # Update job and worker queues self.proc_pending[jobid] = False # update the job dependency structure diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py index a20242df95..9d73b7b51a 100644 --- a/nipype/pipeline/plugins/callback_log.py +++ b/nipype/pipeline/plugins/callback_log.py @@ -1,14 +1,8 @@ import datetime import logging -def log_nodes_cb(node, status, result=None): +def log_nodes_cb(node, status): logger = logging.getLogger('callback') - try: - real_mem1 = result['real_memory'] - real_mem2 = result['real_memory2'] - run_seconds = result['run_seconds'] - except Exception as exc: - real_mem1 = real_mem2 = run_seconds = 'N/A' if status == 'start': message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' +\ node._id + '"' + ',"start":' + '"' +str(datetime.datetime.now()) +\ @@ -20,15 +14,15 @@ def log_nodes_cb(node, status, result=None): elif status == 'end': message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ - '"' + ',"memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ - + str(node._interface.num_threads) + ',"real_memory1":' + str(real_mem1) + ',"real_memory2":' + str(real_mem2) + ',"run_seconds":' + str(run_seconds) + '}' + '"' + ',"estimate memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ + + str(node._interface.num_threads) + '}' logger.debug(message) else: message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ - '"' + ',"memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ + '"' + ',"estimate memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ + str(node._interface.num_threads) + ',"error":"True"}' logger.debug(message) diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index 1bca2d1922..ec9a65905e 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -83,8 +83,8 @@ def _submit_job(self, node, updatehash=False): node.inputs.terminal_output = 'allatonce' except: pass - self._taskresult[self._taskid] = self.pool.apply_async(run_node, (node, - updatehash,)) + self._taskresult[self._taskid] = self.pool.apply_async(run_node, + (node, updatehash,)) return self._taskid def _report_crash(self, node, result=None): @@ -161,7 +161,7 @@ def _submit_job(self, node, updatehash=False): except: pass self._taskresult[self._taskid] = self.pool.apply_async(run_node, - (node, updatehash, self.plugin_args), + (node, updatehash,), callback=release_lock) return self._taskid From 0e5e0e9b2c94de19fd43f5a25ab68917e5056fa6 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Fri, 8 Jan 2016 15:37:00 -0500 Subject: [PATCH 34/45] Removed unneccessary import --- nipype/interfaces/io.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index 86359756f6..a5ddb41211 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -226,14 +226,9 @@ 
def __setattr__(self, key, value): # DataSink outputs class DataSinkOutputSpec(TraitedSpec): - ''' - ''' - - # Import packages - import traits.api as tapi # Init out file - out_file = tapi.Any(desc='datasink output') + out_file = traits.Any(desc='datasink output') # Custom DataSink class From 0f78025b64f16e137f4cb6cc1928e2cac8156478 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Fri, 8 Jan 2016 15:43:47 -0500 Subject: [PATCH 35/45] Removed unncessary function argument --- nipype/pipeline/plugins/multiproc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index ec9a65905e..b42213f200 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -13,7 +13,7 @@ from .base import (DistributedPluginBase, report_crash) -def run_node(node, updatehash, plugin_args=None): +def run_node(node, updatehash): result = dict(result=None, traceback=None) try: result['result'] = node.run(updatehash=updatehash) From 15f3cedb22a2a4570271f124e3d8125af3ff9d52 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Fri, 8 Jan 2016 15:56:31 -0500 Subject: [PATCH 36/45] Corrected Carol's in fsl interface code --- nipype/interfaces/fsl/model.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/nipype/interfaces/fsl/model.py b/nipype/interfaces/fsl/model.py index 369ecb28f3..d37f8db111 100644 --- a/nipype/interfaces/fsl/model.py +++ b/nipype/interfaces/fsl/model.py @@ -250,13 +250,7 @@ def _create_ev_files( element=count, ctype=ctype, val=val) ev_txt += "\n" - # if con[0] in con_map.keys(): - # for fconidx in con_map[con[0]]: - # ev_txt += contrast_ftest_element.substitute( - # cnum=ftest_idx.index(fconidx) + 1, - # element=tidx, - # ctype=ctype, - # val=1) + for fconidx in ftest_idx: fval=0 if con[0] in con_map.keys() and fconidx in con_map[con[0]]: @@ -266,7 +260,7 @@ def _create_ev_files( element=tidx, ctype=ctype, val=fval) - ev_txt += "\n" + ev_txt += "\n" # add contrast mask info ev_txt += contrastmask_header.substitute() From ca4bed5a8a2dcd208345a2feb6093a1737f49813 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Mon, 11 Jan 2016 14:48:22 -0500 Subject: [PATCH 37/45] Removed all of the ResourceMultiProc plugin so the S3 datasink --- nipype/interfaces/base.py | 11 +- nipype/interfaces/fsl/model.py | 20 +- nipype/pipeline/plugins/__init__.py | 3 - nipype/pipeline/plugins/base.py | 7 +- nipype/pipeline/plugins/callback_log.py | 28 -- nipype/pipeline/plugins/multiproc.py | 172 +----------- .../pipeline/plugins/semaphore_singleton.py | 2 - .../pipeline/plugins/tests/test_multiproc.py | 185 +------------ nipype/utils/draw_gantt_chart.py | 261 ------------------ 9 files changed, 14 insertions(+), 675 deletions(-) delete mode 100644 nipype/pipeline/plugins/callback_log.py delete mode 100644 nipype/pipeline/plugins/semaphore_singleton.py delete mode 100644 nipype/utils/draw_gantt_chart.py diff --git a/nipype/interfaces/base.py b/nipype/interfaces/base.py index b202453c4f..ac6b7b8af4 100644 --- a/nipype/interfaces/base.py +++ b/nipype/interfaces/base.py @@ -750,8 +750,6 @@ def __init__(self, **inputs): raise Exception('No input_spec in class: %s' % self.__class__.__name__) self.inputs = self.input_spec(**inputs) - self.estimated_memory = 1 - self.num_threads = 1 @classmethod def help(cls, returnhelp=False): @@ -1197,11 +1195,9 @@ def run_command(runtime, output=None, timeout=0.01, redirect_x=False): The returned runtime contains a merged stdout+stderr log with timestamps """ - - 
# Init variables PIPE = subprocess.PIPE - cmdline = runtime.cmdline + cmdline = runtime.cmdline if redirect_x: exist_xvfb, _ = _exists_in_path('xvfb-run', runtime.environ) if not exist_xvfb: @@ -1230,8 +1226,6 @@ def run_command(runtime, output=None, timeout=0.01, redirect_x=False): result = {} errfile = os.path.join(runtime.cwd, 'stderr.nipype') outfile = os.path.join(runtime.cwd, 'stdout.nipype') - - if output == 'stream': streams = [Stream('stdout', proc.stdout), Stream('stderr', proc.stderr)] @@ -1247,6 +1241,7 @@ def _process(drain=0): else: for stream in res[0]: stream.read(drain) + while proc.returncode is None: proc.poll() _process() @@ -1261,7 +1256,6 @@ def _process(drain=0): result[stream._name] = [r[2] for r in rows] temp.sort() result['merged'] = [r[1] for r in temp] - if output == 'allatonce': stdout, stderr = proc.communicate() result['stdout'] = stdout.split('\n') @@ -1279,7 +1273,6 @@ def _process(drain=0): result['stdout'] = [] result['stderr'] = [] result['merged'] = '' - runtime.stderr = '\n'.join(result['stderr']) runtime.stdout = '\n'.join(result['stdout']) runtime.merged = result['merged'] diff --git a/nipype/interfaces/fsl/model.py b/nipype/interfaces/fsl/model.py index d37f8db111..c2d1c960b4 100644 --- a/nipype/interfaces/fsl/model.py +++ b/nipype/interfaces/fsl/model.py @@ -250,17 +250,14 @@ def _create_ev_files( element=count, ctype=ctype, val=val) ev_txt += "\n" - - for fconidx in ftest_idx: - fval=0 - if con[0] in con_map.keys() and fconidx in con_map[con[0]]: - fval=1 - ev_txt += contrast_ftest_element.substitute( - cnum=ftest_idx.index(fconidx) + 1, - element=tidx, - ctype=ctype, - val=fval) - ev_txt += "\n" + if con[0] in con_map.keys(): + for fconidx in con_map[con[0]]: + ev_txt += contrast_ftest_element.substitute( + cnum=ftest_idx.index(fconidx) + 1, + element=tidx, + ctype=ctype, + val=1) + ev_txt += "\n" # add contrast mask info ev_txt += contrastmask_header.substitute() @@ -1959,4 +1956,3 @@ def _list_outputs(self): self.inputs.out_vnscales_name) return outputs - diff --git a/nipype/pipeline/plugins/__init__.py b/nipype/pipeline/plugins/__init__.py index cf392f0f77..dac14301b2 100644 --- a/nipype/pipeline/plugins/__init__.py +++ b/nipype/pipeline/plugins/__init__.py @@ -9,7 +9,6 @@ from .condor import CondorPlugin from .dagman import CondorDAGManPlugin from .multiproc import MultiProcPlugin -from .multiproc import ResourceMultiProcPlugin from .ipython import IPythonPlugin from .somaflow import SomaFlowPlugin from .pbsgraph import PBSGraphPlugin @@ -17,5 +16,3 @@ from .lsf import LSFPlugin from .slurm import SLURMPlugin from .slurmgraph import SLURMGraphPlugin - -from .callback_log import log_nodes_cb diff --git a/nipype/pipeline/plugins/base.py b/nipype/pipeline/plugins/base.py index cee2c7dad5..bb8bd91aef 100644 --- a/nipype/pipeline/plugins/base.py +++ b/nipype/pipeline/plugins/base.py @@ -260,15 +260,10 @@ def run(self, graph, config, updatehash=False): graph=graph) else: logger.debug('Not submitting') - self._wait() + sleep(float(self._config['execution']['poll_sleep_duration'])) self._remove_node_dirs() report_nodes_not_run(notrun) - - - def _wait(self): - sleep(float(self._config['execution']['poll_sleep_duration'])) - def _get_result(self, taskid): raise NotImplementedError diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py deleted file mode 100644 index 9d73b7b51a..0000000000 --- a/nipype/pipeline/plugins/callback_log.py +++ /dev/null @@ -1,28 +0,0 @@ -import datetime -import logging - -def 
log_nodes_cb(node, status): - logger = logging.getLogger('callback') - if status == 'start': - message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' +\ - node._id + '"' + ',"start":' + '"' +str(datetime.datetime.now()) +\ - '"' + ',"estimate memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ - + str(node._interface.num_threads) + '}' - - logger.debug(message) - - elif status == 'end': - message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ - node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ - '"' + ',"estimate memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ - + str(node._interface.num_threads) + '}' - - logger.debug(message) - - else: - message = '{"name":' + '"' + node.name + '"' + ',"id":' + '"' + \ - node._id + '"' + ',"finish":' + '"' + str(datetime.datetime.now()) +\ - '"' + ',"estimate memory":' + str(node._interface.estimated_memory) + ',"num_threads":' \ - + str(node._interface.num_threads) + ',"error":"True"}' - - logger.debug(message) diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index b42213f200..0f6b11c30a 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -12,7 +12,6 @@ from .base import (DistributedPluginBase, report_crash) - def run_node(node, updatehash): result = dict(result=None, traceback=None) try: @@ -23,7 +22,6 @@ def run_node(node, updatehash): result['result'] = node.result return result - class NonDaemonProcess(Process): """A non-daemon process to support internal multiprocessing. """ @@ -68,7 +66,6 @@ def __init__(self, plugin_args=None): else: self.pool = Pool(processes=n_procs) - def _get_result(self, taskid): if taskid not in self._taskresult: raise RuntimeError('Multiproc task %d not found'%taskid) @@ -84,7 +81,8 @@ def _submit_job(self, node, updatehash=False): except: pass self._taskresult[self._taskid] = self.pool.apply_async(run_node, - (node, updatehash,)) + (node, + updatehash,)) return self._taskid def _report_crash(self, node, result=None): @@ -98,169 +96,3 @@ def _report_crash(self, node, result=None): def _clear_task(self, taskid): del self._taskresult[taskid] - - - -import numpy as np -from copy import deepcopy -from ..engine import (MapNode, str2bool) -import datetime -import psutil -from ... import logging -import semaphore_singleton -logger = logging.getLogger('workflow') - -def release_lock(args): - semaphore_singleton.semaphore.release() - -class ResourceMultiProcPlugin(MultiProcPlugin): - """Execute workflow with multiprocessing not sending more jobs at once - than the system can support. - - The plugin_args input to run can be used to control the multiprocessing - execution and defining the maximum amount of memory and threads that - should be used. When those parameters are not specified, - the number of threads and memory of the system is used. - - System consuming nodes should be tagged: - memory_consuming_node.interface.memory = 8 #Gb - thread_consuming_node.interface.num_threads = 16 - - The default number of threads and memory for a node is 1. - - Currently supported options are: - - - num_thread: maximum number of threads to be executed in parallel - - memory: maximum memory that can be used at once. 
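As a usage sketch of what this docstring describes (note that in this revision the per-node attribute is interface.estimated_memory rather than interface.memory, matching the unit tests later in the series), with workflow and node names that are purely illustrative:

    from nipype.pipeline.plugins.callback_log import log_nodes_cb

    # 'workflow' is an existing nipype Workflow and 'big_node' one of its nodes;
    # both names are placeholders for this sketch
    big_node.interface.estimated_memory = 8   # GB; defaults to 1
    big_node.interface.num_threads = 4        # defaults to 1

    workflow.run(plugin='ResourceMultiProc',
                 plugin_args={'n_procs': 8,
                              'memory': 12,
                              'status_callback': log_nodes_cb})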
- - """ - - def __init__(self, plugin_args=None): - super(ResourceMultiProcPlugin, self).__init__(plugin_args=plugin_args) - self.plugin_args = plugin_args - self.processors = cpu_count() - memory = psutil.virtual_memory() - self.memory = memory.total / (1024*1024*1024) - if self.plugin_args: - if 'n_procs' in self.plugin_args: - self.processors = self.plugin_args['n_procs'] - if 'memory' in self.plugin_args: - self.memory = self.plugin_args['memory'] - - def _wait(self): - if len(self.pending_tasks) > 0: - semaphore_singleton.semaphore.acquire() - semaphore_singleton.semaphore.release() - - - def _submit_job(self, node, updatehash=False): - self._taskid += 1 - try: - if node.inputs.terminal_output == 'stream': - node.inputs.terminal_output = 'allatonce' - except: - pass - self._taskresult[self._taskid] = self.pool.apply_async(run_node, - (node, updatehash,), - callback=release_lock) - return self._taskid - - def _send_procs_to_workers(self, updatehash=False, graph=None): - """ Sends jobs to workers when system resources are available. - Check memory (gb) and cores usage before running jobs. - """ - executing_now = [] - - # Check to see if a job is available - jobids = np.flatnonzero((self.proc_pending == True) & (self.depidx.sum(axis=0) == 0).__array__()) - - #check available system resources by summing all threads and memory used - busy_memory = 0 - busy_processors = 0 - for jobid in jobids: - busy_memory+= self.procs[jobid]._interface.estimated_memory - busy_processors+= self.procs[jobid]._interface.num_threads - - free_memory = self.memory - busy_memory - free_processors = self.processors - busy_processors - - - #check all jobs without dependency not run - jobids = np.flatnonzero((self.proc_done == False) & (self.depidx.sum(axis=0) == 0).__array__()) - - - #sort jobs ready to run first by memory and then by number of threads - #The most resource consuming jobs run first - jobids = sorted(jobids, key=lambda item: (self.procs[item]._interface.estimated_memory, self.procs[item]._interface.num_threads)) - - logger.debug('Free memory: %d, Free processors: %d', free_memory, free_processors) - - - #while have enough memory and processors for first job - #submit first job on the list - for jobid in jobids: - logger.debug('Next Job: %d, memory: %d, threads: %d' %(jobid, self.procs[jobid]._interface.estimated_memory, self.procs[jobid]._interface.num_threads)) - - if self.procs[jobid]._interface.estimated_memory <= free_memory and self.procs[jobid]._interface.num_threads <= free_processors: - logger.info('Executing: %s ID: %d' %(self.procs[jobid]._id, jobid)) - executing_now.append(self.procs[jobid]) - - if isinstance(self.procs[jobid], MapNode): - try: - num_subnodes = self.procs[jobid].num_subnodes() - except Exception: - self._clean_queue(jobid, graph) - self.proc_pending[jobid] = False - continue - if num_subnodes > 1: - submit = self._submit_mapnode(jobid) - if not submit: - continue - - # change job status in appropriate queues - self.proc_done[jobid] = True - self.proc_pending[jobid] = True - - free_memory -= self.procs[jobid]._interface.estimated_memory - free_processors -= self.procs[jobid]._interface.num_threads - - # Send job to task manager and add to pending tasks - if self._status_callback: - self._status_callback(self.procs[jobid], 'start') - if str2bool(self.procs[jobid].config['execution']['local_hash_check']): - logger.debug('checking hash locally') - try: - hash_exists, _, _, _ = self.procs[ - jobid].hash_exists() - logger.debug('Hash exists %s' % str(hash_exists)) - if 
(hash_exists and (self.procs[jobid].overwrite == False or (self.procs[jobid].overwrite == None and not self.procs[jobid]._interface.always_run))): - self._task_finished_cb(jobid) - self._remove_node_dirs() - continue - except Exception: - self._clean_queue(jobid, graph) - self.proc_pending[jobid] = False - continue - logger.debug('Finished checking hash') - - if self.procs[jobid].run_without_submitting: - logger.debug('Running node %s on master thread' %self.procs[jobid]) - try: - self.procs[jobid].run() - except Exception: - self._clean_queue(jobid, graph) - self._task_finished_cb(jobid) - self._remove_node_dirs() - - else: - logger.debug('submitting', jobid) - tid = self._submit_job(deepcopy(self.procs[jobid]), updatehash=updatehash) - if tid is None: - self.proc_done[jobid] = False - self.proc_pending[jobid] = False - else: - self.pending_tasks.insert(0, (tid, jobid)) - else: - break - - logger.debug('No jobs waiting to execute') diff --git a/nipype/pipeline/plugins/semaphore_singleton.py b/nipype/pipeline/plugins/semaphore_singleton.py deleted file mode 100644 index 8894615a14..0000000000 --- a/nipype/pipeline/plugins/semaphore_singleton.py +++ /dev/null @@ -1,2 +0,0 @@ -import threading -semaphore = threading.Semaphore(1) \ No newline at end of file diff --git a/nipype/pipeline/plugins/tests/test_multiproc.py b/nipype/pipeline/plugins/tests/test_multiproc.py index d2f281eadd..8d9eac3e32 100644 --- a/nipype/pipeline/plugins/tests/test_multiproc.py +++ b/nipype/pipeline/plugins/tests/test_multiproc.py @@ -3,7 +3,7 @@ from tempfile import mkdtemp from shutil import rmtree -from nipype.testing import assert_equal, assert_less_equal +from nipype.testing import assert_equal import nipype.pipeline.engine as pe class InputSpec(nib.TraitedSpec): @@ -47,186 +47,3 @@ def test_run_multiproc(): yield assert_equal, result, [1, 1] os.chdir(cur_dir) rmtree(temp_dir) - - -################################ - - -class InputSpecSingleNode(nib.TraitedSpec): - input1 = nib.traits.Int(desc='a random int') - input2 = nib.traits.Int(desc='a random int') - -class OutputSpecSingleNode(nib.TraitedSpec): - output1 = nib.traits.Int(desc='a random int') - - -class TestInterfaceSingleNode(nib.BaseInterface): - input_spec = InputSpecSingleNode - output_spec = OutputSpecSingleNode - - def _run_interface(self, runtime): - runtime.returncode = 0 - return runtime - - def _list_outputs(self): - outputs = self._outputs().get() - outputs['output1'] = self.inputs.input1 - return outputs - - -def find_metrics(nodes, last_node): - import json - from dateutil.parser import parse - from datetime import datetime - import datetime as d - - - start = parse(nodes[0]['start']) - total_duration = int((parse(last_node['finish']) - start).total_seconds()) - - total_memory = [] - total_threads = [] - for i in range(total_duration): - total_memory.append(0) - total_threads.append(0) - - now = start - for i in range(total_duration): - start_index = 0 - node_start = None - node_finish = None - - x = now - - for j in range(start_index, len(nodes)): - node_start = parse(nodes[j]['start']) - node_finish = parse(nodes[j]['finish']) - - if node_start < x and node_finish > x: - total_memory[i] += nodes[j]['estimated_memory'] - total_threads[i] += nodes[j]['num_threads'] - start_index = j - - if node_start > x: - break - - now += d.timedelta(seconds=1) - - return total_memory, total_threads - - -import os -from nipype.pipeline.plugins.callback_log import log_nodes_cb -import logging -import logging.handlers -import psutil -from multiprocessing 
import cpu_count - -from nipype.utils import draw_gantt_chart - -def test_do_not_use_more_memory_then_specified(): - LOG_FILENAME = 'callback.log' - my_logger = logging.getLogger('callback') - my_logger.setLevel(logging.DEBUG) - - # Add the log message handler to the logger - handler = logging.FileHandler(LOG_FILENAME) - my_logger.addHandler(handler) - - max_memory = 10 - pipe = pe.Workflow(name='pipe') - n1 = pe.Node(interface=TestInterfaceSingleNode(), name='n1') - n2 = pe.Node(interface=TestInterfaceSingleNode(), name='n2') - n3 = pe.Node(interface=TestInterfaceSingleNode(), name='n3') - n4 = pe.Node(interface=TestInterfaceSingleNode(), name='n4') - - n1.interface.estimated_memory = 1 - n2.interface.estimated_memory = 1 - n3.interface.estimated_memory = 10 - n4.interface.estimated_memory = 1 - - pipe.connect(n1, 'output1', n2, 'input1') - pipe.connect(n1, 'output1', n3, 'input1') - pipe.connect(n2, 'output1', n4, 'input1') - pipe.connect(n3, 'output1', n4, 'input2') - n1.inputs.input1 = 10 - - pipe.run(plugin='ResourceMultiProc', plugin_args={'memory': max_memory, - 'status_callback': log_nodes_cb}) - - - nodes, last_node = draw_gantt_chart.log_to_json(LOG_FILENAME) - #usage in every second - memory, threads = find_metrics(nodes, last_node) - - result = True - for m in memory: - if m > max_memory: - result = False - break - - yield assert_equal, result, True - - max_threads = cpu_count() - - result = True - for t in threads: - if t > max_threads: - result = False - break - - yield assert_equal, result, True, "using more threads than system has (threads is not specified by user)" - - os.remove(LOG_FILENAME) - - -def test_do_not_use_more_threads_then_specified(): - LOG_FILENAME = 'callback.log' - my_logger = logging.getLogger('callback') - my_logger.setLevel(logging.DEBUG) - - # Add the log message handler to the logger - handler = logging.FileHandler(LOG_FILENAME) - my_logger.addHandler(handler) - - max_threads = 10 - pipe = pe.Workflow(name='pipe') - n1 = pe.Node(interface=TestInterfaceSingleNode(), name='n1') - n2 = pe.Node(interface=TestInterfaceSingleNode(), name='n2') - n3 = pe.Node(interface=TestInterfaceSingleNode(), name='n3') - n4 = pe.Node(interface=TestInterfaceSingleNode(), name='n4') - - n1.interface.num_threads = 1 - n2.interface.num_threads = 1 - n3.interface.num_threads = 10 - n4.interface.num_threads = 1 - - pipe.connect(n1, 'output1', n2, 'input1') - pipe.connect(n1, 'output1', n3, 'input1') - pipe.connect(n2, 'output1', n4, 'input1') - pipe.connect(n3, 'output1', n4, 'input2') - n1.inputs.input1 = 10 - pipe.config['execution']['poll_sleep_duration'] = 1 - pipe.run(plugin='ResourceMultiProc', plugin_args={'n_procs': max_threads, 'status_callback': log_nodes_cb}) - - nodes, last_node = draw_gantt_chart.log_to_json(LOG_FILENAME) - #usage in every second - memory, threads = find_metrics(nodes, last_node) - - result = True - for t in threads: - if t > max_threads: - result = False - break - - yield assert_equal, result, True, "using more threads than specified" - - max_memory = psutil.virtual_memory().total / (1024*1024) - result = True - for m in memory: - if m > max_memory: - result = False - break - yield assert_equal, result, True, "using more memory than system has (memory is not specified by user)" - - os.remove(LOG_FILENAME) \ No newline at end of file diff --git a/nipype/utils/draw_gantt_chart.py b/nipype/utils/draw_gantt_chart.py deleted file mode 100644 index 84bbc033a0..0000000000 --- a/nipype/utils/draw_gantt_chart.py +++ /dev/null @@ -1,261 +0,0 @@ -import json 
-from dateutil import parser -import datetime -import random - - -def log_to_json(logfile): - result = [] - with open(logfile, 'r') as content: - - #read file separating each line - content = content.read() - lines = content.split('\n') - - lines = [ json.loads(x) for x in lines[:-1]] - - last_node = [ x for x in lines if x.has_key('finish')][-1] - - for i, line in enumerate(lines): - #get first start it finds - if not line.has_key('start'): - continue - - #fint the end node for that start - for j in range(i+1, len(lines)): - if lines[j].has_key('finish'): - if lines[j]['id'] == line['id'] and lines[j]['name'] == line['name']: - line['finish'] = lines[j]['finish'] - line['duration'] = (parser.parse(line['finish']) - parser.parse(line['start'])).total_seconds() - result.append(line) - break - - return result, last_node - - -#total duration in seconds -def draw_lines(start, total_duration, minute_scale, scale): - result = '' - next_line = 220 - next_time = start; - num_lines = int((total_duration/60) / minute_scale) +2; - - for i in range(num_lines): - new_line = "
" - result += new_line - - time = "

" + str(next_time.hour) + ':' + str(next_time.minute) + "

"; - result += time - - next_line += minute_scale * scale - next_time += datetime.timedelta(minutes=minute_scale) - return result - -def draw_nodes(start, nodes, cores, scale, colors): - result = '' - end_times = [datetime.datetime(start.year, start.month, start.day, start.hour, start.minute, start.second) for x in range(cores)] - - for node in nodes: - node_start = parser.parse(node['start']) - node_finish = parser.parse(node['finish']) - offset = ((node_start - start).total_seconds() / 60) * scale + 220 - scale_duration = (node['duration'] / 60) * scale - if scale_duration < 5: - scale_duration = 5 - - scale_duration -= 2 - left = 60 - for j in range(len(end_times)): - if end_times[j] < node_start: - left += j * 30 - end_times[j] = datetime.datetime(node_finish.year, node_finish.month, node_finish.day, node_finish.hour, node_finish.minute, node_finish.second) - #end_times[j]+= datetime.timedelta(microseconds=node_finish.microsecond) - break - - color = random.choice(colors) - new_node = "
"; - result += new_node - return result - - -def draw_thread_bar(start, total_duration, nodes, space_between_minutes, minute_scale): - result = "

Threads

" - - total = total_duration/60 - thread = [0 for x in range(total)] - - now = start - - #calculate nuber of threads in every second - for i in range(total): - node_start = None - node_finish = None - - for j in range(i, len(nodes)): - node_start = parser.parse(nodes[j]['start']) - node_finish = parser.parse(nodes[j]['finish']) - - if node_start <= now and node_finish >= now: - thread[i] += nodes[j]['num_threads'] - if node_start > now: - break - now += datetime.timedelta(minutes=1) - - - #draw thread bar - scale = float(space_between_minutes/float(minute_scale)) - - for i in range(len(thread)): - width = thread[i] * 10 - t = (i*scale*minute_scale) + 220 - bar = "
" - result += bar - - return result - - - -def draw_memory_bar(start, total_duration, nodes, space_between_minutes, minute_scale): - result = "

Memory

" - - total = total_duration/60 - memory = [0 for x in range(total)] - - now = start - - #calculate nuber of threads in every second - for i in range(total): - node_start = None - node_finish = None - - for j in range(i, len(nodes)): - node_start = parser.parse(nodes[j]['start']) - node_finish = parser.parse(nodes[j]['finish']) - - if node_start <= now and node_finish >= now: - memory[i] += nodes[j]['estimated_memory'] - if node_start > now: - break - now += datetime.timedelta(minutes=1) - - - #draw thread bar - scale = float(space_between_minutes/float(minute_scale)) - - for i in range(len(memory)): - width = memory[i] * 10 - t = (i*scale*minute_scale) + 220 - bar = "
" - result += bar - - return result - - -''' -Generates a gantt chart in html showing the workflow execution based on a callback log file. -This script was intended to be used with the ResourceMultiprocPlugin. -The following code shows how to set up the workflow in order to generate the log file: - -# import logging -# import logging.handlers -# from nipype.pipeline.plugins.callback_log import log_nodes_cb - -# log_filename = 'callback.log' -# logger = logging.getLogger('callback') -# logger.setLevel(logging.DEBUG) -# handler = logging.FileHandler(log_filename) -# logger.addHandler(handler) - -# #create workflow -# workflow = ... - -# workflow.run(plugin='ResourceMultiProc', -# plugin_args={'num_threads':8, 'memory':12, 'status_callback': log_nodes_cb}) - -# generate_gantt_chart('callback.log', 8) -''' -def generate_gantt_chart(logfile, cores, minute_scale=10, space_between_minutes=50, colors=["#7070FF", "#4E4EB2", "#2D2D66", "#9B9BFF"]): - - result, last_node = log_to_json(logfile) - scale = space_between_minutes - - #add the html header - html_string = ''' - - - - - -
''' - - - #create the header of the report with useful information - start = parser.parse(result[0]['start']) - duration = int((parser.parse(last_node['finish']) - start).total_seconds()) - - html_string += '

Start: '+ result[0]['start'] +'

' - html_string += '

Finish: '+ last_node['finish'] +'

' - html_string += '

Duration: '+ str(duration/60) +' minutes

' - html_string += '

Nodes: '+str(len(result))+'

' - html_string += '

Cores: '+str(cores)+'

' - - - #draw lines - html_string += draw_lines(start, duration, minute_scale, scale) - - #draw nodes - html_string += draw_nodes(start, result, cores, scale, colors) - - html_string += draw_thread_bar(start, duration, result, space_between_minutes, minute_scale) - html_string += draw_memory_bar(start, duration, result, space_between_minutes, minute_scale) - - #finish html - html_string+= ''' -
- ''' - - #save file - html_file = open(logfile +'.html', 'wb') - html_file.write(html_string) - html_file.close() \ No newline at end of file From ecb05e2c8f87298dbfb4b8d9691e763975cf0c19 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Tue, 12 Jan 2016 13:03:04 -0500 Subject: [PATCH 38/45] Found merge HEAD comment and removed --- nipype/pipeline/plugins/base.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/nipype/pipeline/plugins/base.py b/nipype/pipeline/plugins/base.py index 1e32af4f71..162ddd9df4 100644 --- a/nipype/pipeline/plugins/base.py +++ b/nipype/pipeline/plugins/base.py @@ -20,16 +20,11 @@ import numpy as np import scipy.sparse as ssp -<<<<<<< HEAD -from ..utils import (nx, dfs_preorder, topological_sort) -from ..engine import (MapNode, str2bool) -======= from ...utils.filemanip import savepkl, loadpkl from ...utils.misc import str2bool from ..engine.utils import (nx, dfs_preorder, topological_sort) from ..engine import MapNode ->>>>>>> 77ffab33003e8c69712bc3015c213c6979ef77ff from ... import logging From ee70359bf7e1a06eef329323c7c32f0d0b97e666 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Tue, 12 Jan 2016 13:58:32 -0500 Subject: [PATCH 39/45] Removed print statements from fakes3 checker and made it a check at the beginning --- nipype/interfaces/tests/test_io.py | 39 +++++++++--------------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/nipype/interfaces/tests/test_io.py b/nipype/interfaces/tests/test_io.py index 17906b3172..efdd1bb483 100644 --- a/nipype/interfaces/tests/test_io.py +++ b/nipype/interfaces/tests/test_io.py @@ -34,6 +34,17 @@ except: noboto3 = True +# Check for fakes3 +import subprocess +try: + ret_code = subprocess.check_call(['which', 'fakes3'], stdout=open(os.devnull, 'wb')) + if ret_code == 0: + fakes3_found = True + else: + fakes3_found = False +except: + fakes3_found = False + def test_datagrabber(): dg = nio.DataGrabber() yield assert_equal, dg.inputs.template, Undefined @@ -173,30 +184,6 @@ def test_datasink(): ds = nio.DataSink(infields=['test']) yield assert_true, 'test' in ds.inputs.copyable_trait_names() -# Function to check for fakes3 -def _check_for_fakes3(): - ''' - Function used internally to check for fakes3 installation - ''' - - # Import packages - import subprocess - - # Init variables - fakes3_found = False - - # Check for fakes3 - try: - ret_code = subprocess.check_call(['which', 'fakes3'], stdout=open(os.devnull, 'wb')) - if ret_code == 0: - fakes3_found = True - except subprocess.CalledProcessError as exc: - print 'fakes3 not found, install via \'gem install fakes3\', skipping test...' - except: - print 'Unable to check for fakes3 installation, skipping test...' 
- - # Return if found - return fakes3_found def _make_dummy_input(): ''' @@ -216,10 +203,6 @@ def _make_dummy_input(): # Return path return input_path -# Check for fakes3 -fakes3 = _check_for_fakes3() - - @skipif(noboto3 or not fakes3) # Test datasink writes to s3 properly def test_datasink_to_s3(): From 7ecaefd3ba446fa213d4b9b8ca0eff72ac720e3a Mon Sep 17 00:00:00 2001 From: dclark87 Date: Tue, 12 Jan 2016 14:00:02 -0500 Subject: [PATCH 40/45] Changed fakes3_found to fakes3 --- nipype/interfaces/tests/test_io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nipype/interfaces/tests/test_io.py b/nipype/interfaces/tests/test_io.py index efdd1bb483..94270cda73 100644 --- a/nipype/interfaces/tests/test_io.py +++ b/nipype/interfaces/tests/test_io.py @@ -39,11 +39,11 @@ try: ret_code = subprocess.check_call(['which', 'fakes3'], stdout=open(os.devnull, 'wb')) if ret_code == 0: - fakes3_found = True + fakes3 = True else: - fakes3_found = False + fakes3 = False except: - fakes3_found = False + fakes3 = False def test_datagrabber(): dg = nio.DataGrabber() From 818da998e3e02e21c9ac37c349adf918e3a0702f Mon Sep 17 00:00:00 2001 From: dclark87 Date: Wed, 13 Jan 2016 14:27:00 -0500 Subject: [PATCH 41/45] Fixed Python3 compatibility bug in exception raising --- nipype/interfaces/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index eb92ec967c..1290b56b32 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -685,7 +685,7 @@ def _list_outputs(self): if not os.path.exists(outdir): try: os.makedirs(outdir) - except OSError, inst: + except OSError as inst: if 'File exists' in inst: pass else: From 49c14f8c58f34dc2b61a00bf28b58802786b46ac Mon Sep 17 00:00:00 2001 From: dclark87 Date: Wed, 13 Jan 2016 14:30:57 -0500 Subject: [PATCH 42/45] Made exceptions more explicit --- nipype/interfaces/tests/test_io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nipype/interfaces/tests/test_io.py b/nipype/interfaces/tests/test_io.py index 94270cda73..ddd6ab7b3a 100644 --- a/nipype/interfaces/tests/test_io.py +++ b/nipype/interfaces/tests/test_io.py @@ -23,7 +23,7 @@ try: import boto from boto.s3.connection import S3Connection, OrdinaryCallingFormat -except: +except ImportError: noboto = True # Check for boto3 @@ -31,7 +31,7 @@ try: import boto3 from botocore.utils import fix_s3_host -except: +except ImportError: noboto3 = True # Check for fakes3 @@ -42,7 +42,7 @@ fakes3 = True else: fakes3 = False -except: +except subprocess.CalledProcessError: fakes3 = False def test_datagrabber(): From a9dd168c1e791866bb79333dcf72190365a20e3d Mon Sep 17 00:00:00 2001 From: dclark87 Date: Thu, 14 Jan 2016 13:42:06 -0500 Subject: [PATCH 43/45] Removed S3DataSink and changed dummy file writing to be Python2/3 compatible --- nipype/interfaces/io.py | 182 +++++------------- .../interfaces/tests/test_auto_S3DataSink.py | 44 ----- nipype/interfaces/tests/test_io.py | 129 ++++--------- 3 files changed, 87 insertions(+), 268 deletions(-) delete mode 100644 nipype/interfaces/tests/test_auto_S3DataSink.py diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index 1290b56b32..dc5decc779 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -205,7 +205,9 @@ class DataSinkInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec): # AWS S3 data attributes creds_path = traits.Str(desc='Filepath to AWS credentials file for S3 bucket '\ - 'access') + 'access; if not specified, the credentials 
will '\ + 'be taken from the AWS_ACCESS_KEY_ID and '\ + 'AWS_SECRET_ACCESS_KEY environment variables') encrypt_bucket_keys = traits.Bool(desc='Flag indicating whether to use S3 '\ 'server-side AES-256 encryption') # Set this if user wishes to override the bucket with their own @@ -426,16 +428,15 @@ def _check_s3_base_dir(self): return s3_flag # Function to return AWS secure environment variables - def _return_aws_keys(self, creds_path): + def _return_aws_keys(self): ''' Method to return AWS access key id and secret access key using credentials found in a local file. Parameters ---------- - creds_path : string (filepath) - path to the csv file downloaded from AWS; can either be root - or user credentials + self : nipype.interfaces.io.DataSink + self for instance method Returns ------- @@ -445,28 +446,38 @@ def _return_aws_keys(self, creds_path): string of the AWS secret access key ''' + # Import packages + import os + # Init variables - with open(creds_path, 'r') as creds_in: - # Grab csv rows - row1 = creds_in.readline() - row2 = creds_in.readline() - - # Are they root or user keys - if 'User Name' in row1: - # And split out for keys - aws_access_key_id = row2.split(',')[1] - aws_secret_access_key = row2.split(',')[2] - elif 'AWSAccessKeyId' in row1: - # And split out for keys - aws_access_key_id = row1.split('=')[1] - aws_secret_access_key = row2.split('=')[1] - else: - err_msg = 'Credentials file not recognized, check file is correct' - raise Exception(err_msg) + creds_path = self.inputs.creds_path + + # Check if creds exist + if creds_path and os.path.exists(creds_path): + with open(creds_path, 'r') as creds_in: + # Grab csv rows + row1 = creds_in.readline() + row2 = creds_in.readline() + + # Are they root or user keys + if 'User Name' in row1: + # And split out for keys + aws_access_key_id = row2.split(',')[1] + aws_secret_access_key = row2.split(',')[2] + elif 'AWSAccessKeyId' in row1: + # And split out for keys + aws_access_key_id = row1.split('=')[1] + aws_secret_access_key = row2.split('=')[1] + else: + err_msg = 'Credentials file not recognized, check file is correct' + raise Exception(err_msg) - # Strip any carriage return/line feeds - aws_access_key_id = aws_access_key_id.replace('\r', '').replace('\n', '') - aws_secret_access_key = aws_secret_access_key.replace('\r', '').replace('\n', '') + # Strip any carriage return/line feeds + aws_access_key_id = aws_access_key_id.replace('\r', '').replace('\n', '') + aws_secret_access_key = aws_secret_access_key.replace('\r', '').replace('\n', '') + else: + aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') + aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') # Return keys return aws_access_key_id, aws_secret_access_key @@ -479,6 +490,8 @@ def _fetch_bucket(self, bucket_name): Parameters ---------- + self : nipype.interfaces.io.DataSink + self for instance method bucket_name : string string corresponding to the name of the bucket on S3 @@ -504,19 +517,21 @@ def _fetch_bucket(self, bucket_name): creds_path = self.inputs.creds_path iflogger = logging.getLogger('interface') + # Get AWS credentials + try: + aws_access_key_id, aws_secret_access_key = \ + self._return_aws_keys() + except Exception as exc: + err_msg = 'There was a problem extracting the AWS credentials '\ + 'from the credentials file provided: %s. 
Error:\n%s'\ + % (creds_path, exc) + raise Exception(err_msg) + # Try and get AWS credentials if a creds_path is specified - if creds_path: - try: - aws_access_key_id, aws_secret_access_key = \ - self._return_aws_keys(creds_path) - except Exception as exc: - err_msg = 'There was a problem extracting the AWS credentials '\ - 'from the credentials file provided: %s. Error:\n%s'\ - % (creds_path, exc) - raise Exception(err_msg) + if aws_access_key_id and aws_secret_access_key: # Init connection - iflogger.info('Connecting to S3 bucket: %s with credentials from '\ - '%s ...' % (bucket_name, creds_path)) + iflogger.info('Connecting to S3 bucket: %s with credentials...'\ + % bucket_name) # Use individual session for each instance of DataSink # Better when datasinks are being used in multi-threading, see: # http://boto3.readthedocs.org/en/latest/guide/resources.html#multithreading @@ -762,101 +777,6 @@ def _list_outputs(self): return outputs -class S3DataSinkInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec): - testing = traits.Bool(False, usedefault=True, - desc='Flag for using local fakes3 server.' - ' (for testing purposes only)') - anon = traits.Bool(False, usedefault=True, - desc='Use anonymous connection to s3') - bucket = traits.Str(mandatory=True, - desc='Amazon S3 bucket where your data is stored') - bucket_path = traits.Str('', usedefault=True, - desc='Location within your bucket to store ' - 'data.') - base_directory = Directory( - desc='Path to the base directory for storing data.') - container = traits.Str( - desc='Folder within base directory in which to store output') - parameterization = traits.Bool(True, usedefault=True, - desc='store output in parametrized structure') - strip_dir = Directory(desc='path to strip out of filename') - substitutions = InputMultiPath(traits.Tuple(traits.Str, traits.Str), - desc=('List of 2-tuples reflecting string ' - 'to substitute and string to replace ' - 'it with')) - regexp_substitutions = InputMultiPath(traits.Tuple(traits.Str, traits.Str), - desc=('List of 2-tuples reflecting a pair ' - 'of a Python regexp pattern and a ' - 'replacement string. Invoked after ' - 'string `substitutions`')) - - _outputs = traits.Dict(traits.Str, value={}, usedefault=True) - remove_dest_dir = traits.Bool(False, usedefault=True, - desc='remove dest directory when copying dirs') - # Set this if user wishes to have local copy of files as well - local_copy = traits.Str(desc='Copy files locally as well as to S3 bucket') - - def __setattr__(self, key, value): - if key not in self.copyable_trait_names(): - if not isdefined(value): - super(S3DataSinkInputSpec, self).__setattr__(key, value) - self._outputs[key] = value - else: - if key in self._outputs: - self._outputs[key] = value - super(S3DataSinkInputSpec, self).__setattr__(key, value) - - -class S3DataSink(DataSink): - """ Works exactly like DataSink, except the specified files will - also be uploaded to Amazon S3 storage in the specified bucket - and location. 'bucket_path' is the s3 analog for - 'base_directory'. - - """ - input_spec = S3DataSinkInputSpec - - def _list_outputs(self): - """Execute this module. 
- """ - outputs = super(S3DataSink, self)._list_outputs() - - self.localtos3(outputs['out_file']) - - return outputs - - def localtos3(self, paths): - if self.inputs.testing: - conn = S3Connection(anon=True, is_secure=False, port=4567, - host='localhost', - calling_format=OrdinaryCallingFormat()) - - else: - conn = S3Connection(anon=self.inputs.anon) - bkt = conn.get_bucket(self.inputs.bucket) - s3paths = [] - - for path in paths: - # convert local path to s3 path - bd_index = path.find(self.inputs.base_directory) - if bd_index != -1: # base_directory is in path, maintain directory structure - s3path = path[bd_index + len(self.inputs.base_directory):] # cut out base directory - if s3path[0] == os.path.sep: - s3path = s3path[1:] - else: # base_directory isn't in path, simply place all files in bucket_path folder - s3path = os.path.split(path)[1] # take filename from path - s3path = os.path.join(self.inputs.bucket_path, s3path) - if s3path[-1] == os.path.sep: - s3path = s3path[:-1] - s3paths.append(s3path) - - k = boto.s3.key.Key(bkt) - k.key = s3path - k.set_contents_from_filename(path) - - return s3paths - - class S3DataGrabberInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec): anon = traits.Bool(False, usedefault=True, desc='Use anonymous connection to s3. If this is set to True, boto may print' + diff --git a/nipype/interfaces/tests/test_auto_S3DataSink.py b/nipype/interfaces/tests/test_auto_S3DataSink.py deleted file mode 100644 index 9ef342defb..0000000000 --- a/nipype/interfaces/tests/test_auto_S3DataSink.py +++ /dev/null @@ -1,44 +0,0 @@ -# AUTO-GENERATED by tools/checkspecs.py - DO NOT EDIT -from ...testing import assert_equal -from ..io import S3DataSink - - -def test_S3DataSink_inputs(): - input_map = dict(_outputs=dict(usedefault=True, - ), - anon=dict(usedefault=True, - ), - base_directory=dict(), - bucket=dict(mandatory=True, - ), - bucket_path=dict(usedefault=True, - ), - container=dict(), - ignore_exception=dict(nohash=True, - usedefault=True, - ), - parameterization=dict(usedefault=True, - ), - regexp_substitutions=dict(), - remove_dest_dir=dict(usedefault=True, - ), - strip_dir=dict(), - substitutions=dict(), - testing=dict(usedefault=True, - ), - ) - inputs = S3DataSink.input_spec() - - for key, metadata in list(input_map.items()): - for metakey, value in list(metadata.items()): - yield assert_equal, getattr(inputs.traits()[key], metakey), value - - -def test_S3DataSink_outputs(): - output_map = dict(out_file=dict(), - ) - outputs = S3DataSink.output_spec() - - for key, metadata in list(output_map.items()): - for metakey, value in list(metadata.items()): - yield assert_equal, getattr(outputs.traits()[key], metakey), value diff --git a/nipype/interfaces/tests/test_io.py b/nipype/interfaces/tests/test_io.py index ddd6ab7b3a..c1f4ec35f5 100644 --- a/nipype/interfaces/tests/test_io.py +++ b/nipype/interfaces/tests/test_io.py @@ -185,26 +185,30 @@ def test_datasink(): yield assert_true, 'test' in ds.inputs.copyable_trait_names() +# Make dummy input file def _make_dummy_input(): ''' + Function to create a dummy file ''' # Import packages import tempfile + # Init variables input_dir = tempfile.mkdtemp() input_path = os.path.join(input_dir, 'datasink_test_s3.txt') # Create input file with open(input_path, 'wb') as f: - f.write('ABCD1234') + f.write(b'ABCD1234') # Return path return input_path -@skipif(noboto3 or not fakes3) + # Test datasink writes to s3 properly +@skipif(noboto3 or not fakes3) def test_datasink_to_s3(): ''' This function tests to see if the S3 functionality of 
a DataSink @@ -264,6 +268,36 @@ def test_datasink_to_s3(): # Make sure md5sums match yield assert_equal, src_md5, dst_md5 + +# Test AWS creds read from env vars +@skipif(noboto3 or not fakes3) +def test_aws_keys_from_env(): + ''' + Function to ensure the DataSink can successfully read in AWS + credentials from the environment variables + ''' + + # Import packages + import os + import nipype.interfaces.io as nio + + # Init variables + ds = nio.DataSink() + aws_access_key_id = 'ABCDACCESS' + aws_secret_access_key = 'DEFGSECRET' + + # Set env vars + os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id + os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key + + # Call function to return creds + access_key_test, secret_key_test = ds._return_aws_keys() + + # Assert match + yield assert_equal, aws_access_key_id, access_key_test + yield assert_equal, aws_secret_access_key, secret_key_test + + # Test the local copy attribute def test_datasink_localcopy(): ''' @@ -308,19 +342,6 @@ def test_datasink_localcopy(): yield assert_equal, src_md5, dst_md5 -@skipif(noboto) -def test_s3datasink(): - ds = nio.S3DataSink() - yield assert_true, ds.inputs.parameterization - yield assert_equal, ds.inputs.base_directory, Undefined - yield assert_equal, ds.inputs.strip_dir, Undefined - yield assert_equal, ds.inputs._outputs, {} - ds = nio.S3DataSink(base_directory='foo') - yield assert_equal, ds.inputs.base_directory, 'foo' - ds = nio.S3DataSink(infields=['test']) - yield assert_true, 'test' in ds.inputs.copyable_trait_names() - - def test_datasink_substitutions(): indir = mkdtemp(prefix='-Tmp-nipype_ds_subs_in') outdir = mkdtemp(prefix='-Tmp-nipype_ds_subs_out') @@ -349,84 +370,6 @@ def test_datasink_substitutions(): shutil.rmtree(indir) shutil.rmtree(outdir) -@skipif(noboto or not fakes3) - -def test_s3datasink_substitutions(): - indir = mkdtemp(prefix='-Tmp-nipype_ds_subs_in') - outdir = mkdtemp(prefix='-Tmp-nipype_ds_subs_out') - files = [] - for n in ['ababab.n', 'xabababyz.n']: - f = os.path.join(indir, n) - files.append(f) - open(f, 'w') - - # run fakes3 server and set up bucket - fakes3dir = op.expanduser('~/fakes3') - try: - proc = Popen( - ['fakes3', '-r', fakes3dir, '-p', '4567'], stdout=open(os.devnull, 'wb')) - except OSError as ose: - if 'No such file or directory' in str(ose): - return # fakes3 not installed. OK! 
- raise ose - - conn = S3Connection(anon=True, is_secure=False, port=4567, - host='localhost', - calling_format=OrdinaryCallingFormat()) - conn.create_bucket('test') - - ds = nio.S3DataSink( - testing=True, - anon=True, - bucket='test', - bucket_path='output/', - parametrization=False, - base_directory=outdir, - substitutions=[('ababab', 'ABABAB')], - # end archoring ($) is used to assure operation on the filename - # instead of possible temporary directories names matches - # Patterns should be more comprehendable in the real-world usage - # cases since paths would be quite more sensible - regexp_substitutions=[(r'xABABAB(\w*)\.n$', r'a-\1-b.n'), - ('(.*%s)[-a]([^%s]*)$' % ((os.path.sep,) * 2), - r'\1!\2')]) - setattr(ds.inputs, '@outdir', files) - ds.run() - yield assert_equal, \ - sorted([os.path.basename(x) for - x in glob.glob(os.path.join(outdir, '*'))]), \ - ['!-yz-b.n', 'ABABAB.n'] # so we got re used 2nd and both patterns - - bkt = conn.get_bucket(ds.inputs.bucket) - bkt_files = list(k for k in bkt.list()) - - found = [False, False] - failed_deletes = 0 - for k in bkt_files: - if '!-yz-b.n' in k.key: - found[0] = True - try: - bkt.delete_key(k) - except: - failed_deletes += 1 - elif 'ABABAB.n' in k.key: - found[1] = True - try: - bkt.delete_key(k) - except: - failed_deletes += 1 - - # ensure delete requests were successful - yield assert_equal, failed_deletes, 0 - - # ensure both keys are found in bucket - yield assert_equal, found.count(True), 2 - - proc.kill() - shutil.rmtree(fakes3dir) - shutil.rmtree(indir) - shutil.rmtree(outdir) - def _temp_analyze_files(): """Generate temporary analyze file pair.""" From c2eedc7128f3e9553d75c2a84798a365b3fbec11 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Tue, 2 Feb 2016 16:17:58 -0500 Subject: [PATCH 44/45] Added aws.rst file documenting use of new S3 capabilities in the DataSink class --- doc/users/aws.rst | 102 ++++++++++++++++++++++++++++++++++++++++++++ doc/users/index.rst | 1 + 2 files changed, 103 insertions(+) create mode 100644 doc/users/aws.rst diff --git a/doc/users/aws.rst b/doc/users/aws.rst new file mode 100644 index 0000000000..832072ba62 --- /dev/null +++ b/doc/users/aws.rst @@ -0,0 +1,102 @@ +.. _aws: + +============================================ +Using Nipype with Amazon Web Services (AWS) +============================================ +Several groups have been successfully using Nipype on AWS. This procedure +involves setting a temporary cluster using StarCluster and potentially +transferring files to/from S3. The latter is supported by Nipype through +DataSink and S3DataGrabber. + + +Using DataSink with S3 +====================== +The DataSink class now supports sending output data directly to an AWS S3 +bucket. It does this through the introduction of several input attributes to the +DataSink interface and by parsing the `base_directory` attribute. This class +uses the `boto3 `_ and +`botocore `_ Python packages to +interact with AWS. To configure the DataSink to write data to S3, the user must +set the ``base_directory`` property to an S3-style filepath. For example: + +:: + + import nipype.interfaces.io as nio + ds = nio.DataSink() + ds.inputs.base_directory = 's3://mybucket/path/to/output/dir' + +With the "s3://" prefix in the path, the DataSink knows that the output +directory to send files is on S3 in the bucket "mybucket". 
"path/to/output/dir" +is the relative directory path within the bucket "mybucket" where output data +will be uploaded to (NOTE: if the relative path specified contains folders that +don’t exist in the bucket, the DataSink will create them). The DataSink treats +the S3 base directory exactly as it would a local directory, maintaining support +for containers, substitutions, subfolders, "." notation, etc to route output +data appropriately. + +There are four new attributes introduced with S3-compatibility: ``creds_path``, +``encrypt_bucket_keys``, ``local_copy``, and ``bucket``. + +:: + + ds.inputs.creds_path = '/home/user/aws_creds/credentials.csv' + ds.inputs.encrypt_bucket_keys = True + ds.local_copy = '/home/user/workflow_outputs/local_backup' + +``creds_path`` is a file path where the user's AWS credentials file (typically +a csv) is stored. This credentials file should contain the AWS access key id and +secret access key and should be formatted as one of the following (these formats +are how Amazon provides the credentials file by default when first downloaded). + +Root-account user: + +:: + + AWSAccessKeyID=ABCDEFGHIJKLMNOP + AWSSecretKey=zyx123wvu456/ABC890+gHiJk + +IAM-user: + +:: + + User Name,Access Key Id,Secret Access Key + "username",ABCDEFGHIJKLMNOP,zyx123wvu456/ABC890+gHiJk + +The ``creds_path`` is necessary when writing files to a bucket that has +restricted access (almost no buckets are publicly writable). If ``creds_path`` +is not specified, the DataSink will check the ``AWS_ACCESS_KEY_ID`` and +``AWS_SECRET_ACCESS_KEY`` environment variables and use those values for bucket +access. + +``encrypt_bucket_keys`` is a boolean flag that indicates whether to encrypt the +output data on S3, using server-side AES-256 encryption. This is useful if the +data being output is sensitive and one desires an extra layer of security on the +data. By default, this is turned off. + +``local_copy`` is a string of the filepath where local copies of the output data +are stored in addition to those sent to S3. This is useful if one wants to keep +a backup version of the data stored on their local computer. By default, this is +turned off. + +``bucket`` is a boto3 Bucket object that the user can use to overwrite the +bucket specified in their ``base_directory``. This can be useful if one has to +manually create a bucket instance on their own using special credentials (or +using a mock server like `fakes3 `_). This is +typically used for developers unit-testing the DataSink class. Most users do not +need to use this attribute for actual workflows. This is an optional argument. + +Finally, the user needs only to specify the input attributes for any incoming +data to the node, and the outputs will be written to their S3 bucket. + +:: + + workflow.connect(inputnode, 'subject_id', ds, 'container') + workflow.connect(realigner, 'realigned_files', ds, 'motion') + +So, for example, outputs for sub001’s realigned_file1.nii.gz will be in: +s3://mybucket/path/to/output/dir/sub001/motion/realigned_file1.nii.gz + + +Using S3DataGrabber +====================== +Coming soon... 
\ No newline at end of file diff --git a/doc/users/index.rst b/doc/users/index.rst index 3a432135a6..13c1487ae0 100644 --- a/doc/users/index.rst +++ b/doc/users/index.rst @@ -38,6 +38,7 @@ spmmcr mipav nipypecmd + aws From c0d148aec7505e9c0df439c5b53d452ae478f352 Mon Sep 17 00:00:00 2001 From: dclark87 Date: Wed, 3 Feb 2016 13:15:22 -0500 Subject: [PATCH 45/45] Removed bucket from being an attribute of the DataSink and just made it a local variable; pickle is not able to pickle the Bucket object. Functionally, the DataSink is the same --- nipype/interfaces/io.py | 73 +++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 40 deletions(-) diff --git a/nipype/interfaces/io.py b/nipype/interfaces/io.py index dc5decc779..6f0ad3bc32 100644 --- a/nipype/interfaces/io.py +++ b/nipype/interfaces/io.py @@ -375,8 +375,7 @@ def _check_s3_base_dir(self): ''' Method to see if the datasink's base directory specifies an S3 bucket path; if it does, it parses the path for the bucket - name in the form 's3://bucket_name/...' and adds a bucket - attribute to the data sink instance, i.e. self.bucket + name in the form 's3://bucket_name/...' and returns it Parameters ---------- @@ -386,15 +385,19 @@ def _check_s3_base_dir(self): s3_flag : boolean flag indicating whether the base_directory contained an S3 bucket path + bucket_name : string + name of the S3 bucket to connect to; if the base directory + is not a valid S3 path, defaults to '' ''' # Init variables s3_str = 's3://' + bucket_name = '' base_directory = self.inputs.base_directory if not isdefined(base_directory): s3_flag = False - return s3_flag + return s3_flag, bucket_name # Explicitly lower-case the "s3" if base_directory.lower().startswith(s3_str): @@ -404,28 +407,15 @@ def _check_s3_base_dir(self): # Check if 's3://' in base dir if base_directory.startswith(s3_str): - # Attempt to access bucket - try: - # Expects bucket name to be 's3://bucket_name/base_dir/..' - bucket_name = base_directory.split(s3_str)[1].split('/')[0] - # Get the actual bucket object - if self.inputs.bucket: - self.bucket = self.inputs.bucket - else: - self.bucket = self._fetch_bucket(bucket_name) - # Report error in case of exception - except Exception as exc: - err_msg = 'Unable to access S3 bucket. Error:\n%s. Exiting...'\ - % exc - raise Exception(err_msg) - # Bucket access was a success, set flag + # Expects bucket name to be 's3://bucket_name/base_dir/..' + bucket_name = base_directory.split(s3_str)[1].split('/')[0] s3_flag = True # Otherwise it's just a normal datasink else: s3_flag = False # Return s3_flag - return s3_flag + return s3_flag, bucket_name # Function to return AWS secure environment variables def _return_aws_keys(self): @@ -576,7 +566,7 @@ def _fetch_bucket(self, bucket_name): return bucket # Send up to S3 method - def _upload_to_s3(self, src, dst): + def _upload_to_s3(self, bucket, src, dst): ''' Method to upload outputs to S3 bucket instead of on local disk ''' @@ -589,7 +579,6 @@ def _upload_to_s3(self, src, dst): from botocore.exceptions import ClientError # Init variables - bucket = self.bucket iflogger = logging.getLogger('interface') s3_str = 's3://' s3_prefix = s3_str + bucket.name @@ -668,30 +657,34 @@ def _list_outputs(self): outdir = '.' 
# Check if base directory reflects S3 bucket upload - try: - s3_flag = self._check_s3_base_dir() - if s3_flag: - s3dir = self.inputs.base_directory - if isdefined(self.inputs.container): - s3dir = os.path.join(s3dir, self.inputs.container) + s3_flag, bucket_name = self._check_s3_base_dir() + if s3_flag: + s3dir = self.inputs.base_directory + # If user overrides bucket object, use that + if self.inputs.bucket: + bucket = self.inputs.bucket + # Otherwise fetch bucket object using name else: - s3dir = '' - # If encountering an exception during bucket access, set output - # base directory to a local folder - except Exception as exc: + try: + bucket = self._fetch_bucket(bucket_name) + # If encountering an exception during bucket access, set output + # base directory to a local folder + except Exception as exc: + s3dir = '' + if not isdefined(self.inputs.local_copy): + local_out_exception = os.path.join(os.path.expanduser('~'), + 's3_datasink_' + bucket_name) + outdir = local_out_exception + # Log local copying directory + iflogger.info('Access to S3 failed! Storing outputs locally at: '\ + '%s\nError: %s' %(outdir, exc)) + else: s3dir = '' - s3_flag = False - if not isdefined(self.inputs.local_copy): - local_out_exception = os.path.join(os.path.expanduser('~'), - 's3_datasink_' + self.bucket.name) - outdir = local_out_exception - # Log local copying directory - iflogger.info('Access to S3 failed! Storing outputs locally at: '\ - '%s\nError: %s' %(outdir, exc)) # If container input is given, append that to outdir if isdefined(self.inputs.container): outdir = os.path.join(outdir, self.inputs.container) + s3dir = os.path.join(s3dir, self.inputs.container) # If sinking to local folder if outdir != s3dir: @@ -743,7 +736,7 @@ def _list_outputs(self): # If we're uploading to S3 if s3_flag: - self._upload_to_s3(src, s3dst) + self._upload_to_s3(bucket, src, s3dst) out_files.append(s3dst) # Otherwise, copy locally src -> dst if not s3_flag or isdefined(self.inputs.local_copy):