diff --git a/.circle/tests.sh b/.circle/tests.sh index 202f9c5918..0178ab91dd 100644 --- a/.circle/tests.sh +++ b/.circle/tests.sh @@ -17,8 +17,8 @@ fi # They may need to be rebalanced in the future. case ${CIRCLE_NODE_INDEX} in 0) - docker run --rm=false -it -e FSL_COURSE_DATA="/data/examples/nipype-fsl_course_data" -v $HOME/examples:/data/examples:ro -v $WORKDIR:/work -w /work nipype/nipype:py36 /usr/bin/run_pytests.sh && \ - docker run --rm=false -it -e FSL_COURSE_DATA="/data/examples/nipype-fsl_course_data" -v $HOME/examples:/data/examples:ro -v $WORKDIR:/work -w /work nipype/nipype:py27 /usr/bin/run_pytests.sh && \ + docker run --rm=false -it -e CI_SKIP_TEST=1 -e NIPYPE_RESOURCE_MONITOR=1 -e FSL_COURSE_DATA="/data/examples/nipype-fsl_course_data" -v $HOME/examples:/data/examples:ro -v $WORKDIR:/work -w /work nipype/nipype:py36 /usr/bin/run_pytests.sh && \ + docker run --rm=false -it -e CI_SKIP_TEST=1 -e NIPYPE_RESOURCE_MONITOR=1 -e FSL_COURSE_DATA="/data/examples/nipype-fsl_course_data" -v $HOME/examples:/data/examples:ro -v $WORKDIR:/work -w /work nipype/nipype:py27 /usr/bin/run_pytests.sh && \ docker run --rm=false -it -v $WORKDIR:/work -w /src/nipype/doc --entrypoint=/usr/bin/run_builddocs.sh nipype/nipype:py36 /usr/bin/run_builddocs.sh && \ docker run --rm=false -it -v $HOME/examples:/data/examples:ro -v $WORKDIR:/work -w /work nipype/nipype:py36 /usr/bin/run_examples.sh test_spm Linear /data/examples/ workflow3d && \ docker run --rm=false -it -v $HOME/examples:/data/examples:ro -v $WORKDIR:/work -w /work nipype/nipype:py36 /usr/bin/run_examples.sh test_spm Linear /data/examples/ workflow4d @@ -30,8 +30,8 @@ case ${CIRCLE_NODE_INDEX} in exitcode=$? ;; 2) - docker run --rm=false -it -e NIPYPE_NUMBER_OF_CPUS=4 -v $HOME/examples:/data/examples:ro -v $WORKDIR:/work -w /work nipype/nipype:py27 /usr/bin/run_examples.sh fmri_spm_nested MultiProc /data/examples/ level1 && \ - docker run --rm=false -it -e NIPYPE_NUMBER_OF_CPUS=4 -v $HOME/examples:/data/examples:ro -v $WORKDIR:/work -w /work nipype/nipype:py36 /usr/bin/run_examples.sh fmri_spm_nested MultiProc /data/examples/ l2pipeline + docker run --rm=false -it -e NIPYPE_NUMBER_OF_CPUS=4 -v $HOME/examples:/data/examples:ro -v $WORKDIR:/work -w /work nipype/nipype:py36 /usr/bin/run_examples.sh fmri_spm_nested MultiProc /data/examples/ level1 && \ + docker run --rm=false -it -e NIPYPE_NUMBER_OF_CPUS=4 -e NIPYPE_RESOURCE_MONITOR=1 -v $HOME/examples:/data/examples:ro -v $WORKDIR:/work -w /work nipype/nipype:py27 /usr/bin/run_examples.sh fmri_spm_nested MultiProc /data/examples/ l2pipeline exitcode=$? 
;; 3) diff --git a/.travis.yml b/.travis.yml index be611996f6..a7630ca911 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,10 +8,10 @@ python: - 3.5 - 3.6 env: -- INSTALL_DEB_DEPENDECIES=true NIPYPE_EXTRAS="doc,tests,fmri,profiler" -- INSTALL_DEB_DEPENDECIES=false NIPYPE_EXTRAS="doc,tests,fmri,profiler" -- INSTALL_DEB_DEPENDECIES=true NIPYPE_EXTRAS="doc,tests,fmri,profiler,duecredit" -- INSTALL_DEB_DEPENDECIES=true NIPYPE_EXTRAS="doc,tests,fmri,profiler" PIP_FLAGS="--pre" +- INSTALL_DEB_DEPENDECIES=true NIPYPE_EXTRAS="doc,tests,fmri,profiler" CI_SKIP_TEST=1 +- INSTALL_DEB_DEPENDECIES=false NIPYPE_EXTRAS="doc,tests,fmri,profiler" CI_SKIP_TEST=1 +- INSTALL_DEB_DEPENDECIES=true NIPYPE_EXTRAS="doc,tests,fmri,profiler,duecredit" CI_SKIP_TEST=1 +- INSTALL_DEB_DEPENDECIES=true NIPYPE_EXTRAS="doc,tests,fmri,profiler" PIP_FLAGS="--pre" CI_SKIP_TEST=1 before_install: - function apt_inst { if $INSTALL_DEB_DEPENDECIES; then sudo rm -rf /dev/shm; fi && diff --git a/doc/users/config_file.rst b/doc/users/config_file.rst index 7d55cc522d..c89406faf1 100644 --- a/doc/users/config_file.rst +++ b/doc/users/config_file.rst @@ -14,48 +14,52 @@ Logging ~~~~~~~ *workflow_level* - How detailed the logs regarding workflow should be (possible values: - ``INFO`` and ``DEBUG``; default value: ``INFO``) -*filemanip_level* - How detailed the logs regarding file operations (for example overwriting - warning) should be (possible values: ``INFO`` and ``DEBUG``; default value: - ``INFO``) + How detailed the logs regarding workflow should be (possible values: + ``INFO`` and ``DEBUG``; default value: ``INFO``) +*utils_level* + How detailed the logs regarding nipype utils, like file operations + (for example overwriting warning) or the resource profiler, should be + (possible values: ``INFO`` and ``DEBUG``; default value: + ``INFO``) *interface_level* - How detailed the logs regarding interface execution should be (possible - values: ``INFO`` and ``DEBUG``; default value: ``INFO``) + How detailed the logs regarding interface execution should be (possible + values: ``INFO`` and ``DEBUG``; default value: ``INFO``) +*filemanip_level* (deprecated as of 1.0) + How detailed the logs regarding file operations (for example overwriting + warning) should be (possible values: ``INFO`` and ``DEBUG``) *log_to_file* Indicates whether logging should also send the output to a file (possible values: ``true`` and ``false``; default value: ``false``) *log_directory* - Where to store logs. (string, default value: home directory) + Where to store logs. (string, default value: home directory) *log_size* - Size of a single log file. (integer, default value: 254000) + Size of a single log file. (integer, default value: 254000) *log_rotate* - How many rotation should the log file make. (integer, default value: 4) + How many rotation should the log file make. (integer, default value: 4) Execution ~~~~~~~~~ *plugin* - This defines which execution plugin to use. (possible values: ``Linear``, - ``MultiProc``, ``SGE``, ``IPython``; default value: ``Linear``) + This defines which execution plugin to use. (possible values: ``Linear``, + ``MultiProc``, ``SGE``, ``IPython``; default value: ``Linear``) *stop_on_first_crash* - Should the workflow stop upon first node crashing or try to execute as many - nodes as possible? (possible values: ``true`` and ``false``; default value: - ``false``) + Should the workflow stop upon first node crashing or try to execute as many + nodes as possible? 
(possible values: ``true`` and ``false``; default value: + ``false``) *stop_on_first_rerun* - Should the workflow stop upon first node trying to recompute (by that we - mean rerunning a node that has been run before - this can happen due changed - inputs and/or hash_method since the last run). (possible values: ``true`` - and ``false``; default value: ``false``) + Should the workflow stop upon first node trying to recompute (by that we + mean rerunning a node that has been run before - this can happen due changed + inputs and/or hash_method since the last run). (possible values: ``true`` + and ``false``; default value: ``false``) *hash_method* - Should the input files be checked for changes using their content (slow, but - 100% accurate) or just their size and modification date (fast, but - potentially prone to errors)? (possible values: ``content`` and - ``timestamp``; default value: ``timestamp``) + Should the input files be checked for changes using their content (slow, but + 100% accurate) or just their size and modification date (fast, but + potentially prone to errors)? (possible values: ``content`` and + ``timestamp``; default value: ``timestamp``) *keep_inputs* Ensures that all inputs that are created in the nodes working directory are @@ -63,44 +67,44 @@ Execution value: ``false``) *single_thread_matlab* - Should all of the Matlab interfaces (including SPM) use only one thread? - This is useful if you are parallelizing your workflow using MultiProc or - IPython on a single multicore machine. (possible values: ``true`` and - ``false``; default value: ``true``) + Should all of the Matlab interfaces (including SPM) use only one thread? + This is useful if you are parallelizing your workflow using MultiProc or + IPython on a single multicore machine. (possible values: ``true`` and + ``false``; default value: ``true``) *display_variable* - What ``DISPLAY`` variable should all command line interfaces be - run with. This is useful if you are using `xnest - `_ - or `Xvfb `_ - and you would like to redirect all spawned windows to - it. (possible values: any X server address; default value: not - set) + What ``DISPLAY`` variable should all command line interfaces be + run with. This is useful if you are using `xnest + `_ + or `Xvfb `_ + and you would like to redirect all spawned windows to + it. (possible values: any X server address; default value: not + set) *remove_unnecessary_outputs* - This will remove any interface outputs not needed by the workflow. If the - required outputs from a node changes, rerunning the workflow will rerun the - node. Outputs of leaf nodes (nodes whose outputs are not connected to any - other nodes) will never be deleted independent of this parameter. (possible - values: ``true`` and ``false``; default value: ``true``) + This will remove any interface outputs not needed by the workflow. If the + required outputs from a node changes, rerunning the workflow will rerun the + node. Outputs of leaf nodes (nodes whose outputs are not connected to any + other nodes) will never be deleted independent of this parameter. (possible + values: ``true`` and ``false``; default value: ``true``) *try_hard_link_datasink* - When the DataSink is used to produce an orginized output file outside - of nipypes internal cache structure, a file system hard link will be - attempted first. A hard link allow multiple file paths to point to the - same physical storage location on disk if the conditions allow. 
By - refering to the same physical file on disk (instead of copying files - byte-by-byte) we can avoid unnecessary data duplication. If hard links - are not supported for the source or destination paths specified, then - a standard byte-by-byte copy is used. (possible values: ``true`` and - ``false``; default value: ``true``) + When the DataSink is used to produce an orginized output file outside + of nipypes internal cache structure, a file system hard link will be + attempted first. A hard link allow multiple file paths to point to the + same physical storage location on disk if the conditions allow. By + refering to the same physical file on disk (instead of copying files + byte-by-byte) we can avoid unnecessary data duplication. If hard links + are not supported for the source or destination paths specified, then + a standard byte-by-byte copy is used. (possible values: ``true`` and + ``false``; default value: ``true``) *use_relative_paths* - Should the paths stored in results (and used to look for inputs) - be relative or absolute. Relative paths allow moving the whole - working directory around but may cause problems with - symlinks. (possible values: ``true`` and ``false``; default - value: ``false``) + Should the paths stored in results (and used to look for inputs) + be relative or absolute. Relative paths allow moving the whole + working directory around but may cause problems with + symlinks. (possible values: ``true`` and ``false``; default + value: ``false``) *local_hash_check* Perform the hash check on the job submission machine. This option minimizes @@ -115,10 +119,10 @@ Execution done after a job finish is detected. (float in seconds; default value: 5) *remove_node_directories (EXPERIMENTAL)* - Removes directories whose outputs have already been used - up. Doesn't work with IdentiInterface or any node that patches - data through (without copying) (possible values: ``true`` and - ``false``; default value: ``false``) + Removes directories whose outputs have already been used + up. Doesn't work with IdentiInterface or any node that patches + data through (without copying) (possible values: ``true`` and + ``false``; default value: ``false``) *stop_on_unknown_version* If this is set to True, an underlying interface will raise an error, when no @@ -146,18 +150,27 @@ Execution crashfiles allow portability across machines and shorter load time. (possible values: ``pklz`` and ``txt``; default value: ``pklz``) +*resource_monitor* + Enables monitoring the resources occupation (possible values: ``true`` and + ``false``; default value: ``false``) + +*resource_monitor_frequency* + Sampling period (in seconds) between measurements of resources (memory, cpus) + being used by an interface. Requires ``resource_monitor`` to be ``true``. + (default value: ``1``) + Example ~~~~~~~ :: - [logging] - workflow_level = DEBUG + [logging] + workflow_level = DEBUG - [execution] - stop_on_first_crash = true - hash_method = timestamp - display_variable = :1 + [execution] + stop_on_first_crash = true + hash_method = timestamp + display_variable = :1 Workflow.config property has a form of a nested dictionary reflecting the structure of the .cfg file. 
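The ``resource_monitor`` and ``resource_monitor_frequency`` options documented in the hunk above can also be set at run time through the nested-dictionary ``Workflow.config`` property mentioned in its last paragraph. A minimal sketch, assuming an arbitrary sampling period and a hypothetical workflow name; ``config.enable_resource_monitor()`` and ``config.set()`` are the helpers exercised by the new test module further down in this patch::

    from nipype import config
    import nipype.pipeline.engine as pe

    # Equivalent to "resource_monitor = true" under [execution] in ~/.nipype/nipype.cfg
    config.enable_resource_monitor()
    config.set('execution', 'resource_monitor_frequency', '5')  # seconds between samples

    # Per-workflow override via the nested-dictionary Workflow.config property
    wf = pe.Workflow(name='demo_wf')  # placeholder workflow name
    wf.config['execution'] = {'resource_monitor': 'true',
                              'resource_monitor_frequency': '5'}
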
diff --git a/doc/users/plugins.rst b/doc/users/plugins.rst
index 6c825aa8f8..4c0960c554 100644
--- a/doc/users/plugins.rst
+++ b/doc/users/plugins.rst
@@ -74,6 +74,14 @@ Optional arguments::
     n_procs : Number of processes to launch in parallel, if not set number of
     processors/threads will be automatically detected
 
+    memory_gb : Total memory available to be shared by all simultaneous tasks
+      currently running, if not set it will be automatically set to 90\% of
+      system RAM.
+
+    raise_insufficient : Raise exception when the estimated resources of a node
+      exceed the total amount of resources available (memory and threads), when
+      ``False`` (default), only a warning will be issued.
+
 To distribute processing on a multicore machine, simply call::
 
     workflow.run(plugin='MultiProc')
diff --git a/doc/users/resource_sched_profiler.rst b/doc/users/resource_sched_profiler.rst
index 37404b27da..7fa0819c19 100644
--- a/doc/users/resource_sched_profiler.rst
+++ b/doc/users/resource_sched_profiler.rst
@@ -82,7 +82,7 @@ by setting the ``status_callback`` parameter to point to this function in the
 
 ::
 
-    from nipype.pipeline.plugins.callback_log import log_nodes_cb
+    from nipype.utils.profiler import log_nodes_cb
     args_dict = {'n_procs' : 8, 'memory_gb' : 10, 'status_callback' : log_nodes_cb}
 
 To set the filepath for the callback log the ``'callback'`` logger must be
@@ -141,7 +141,7 @@ The pandas_ Python package is required to use this feature.
 
 ::
 
-    from nipype.pipeline.plugins.callback_log import log_nodes_cb
+    from nipype.utils.profiler import log_nodes_cb
     args_dict = {'n_procs' : 8, 'memory_gb' : 10, 'status_callback' : log_nodes_cb}
 
     workflow.run(plugin='MultiProc', plugin_args=args_dict)
diff --git a/docker/files/run_examples.sh b/docker/files/run_examples.sh
index a23c27e76b..d8fc17b512 100644
--- a/docker/files/run_examples.sh
+++ b/docker/files/run_examples.sh
@@ -12,10 +12,18 @@ mkdir -p ${HOME}/.nipype ${WORKDIR}/logs/example_${example_id} ${WORKDIR}/tests
 echo "[logging]" > ${HOME}/.nipype/nipype.cfg
 echo "workflow_level = DEBUG" >> ${HOME}/.nipype/nipype.cfg
 echo "interface_level = DEBUG" >> ${HOME}/.nipype/nipype.cfg
-echo "filemanip_level = DEBUG" >> ${HOME}/.nipype/nipype.cfg
+echo "utils_level = DEBUG" >> ${HOME}/.nipype/nipype.cfg
 echo "log_to_file = true" >> ${HOME}/.nipype/nipype.cfg
 echo "log_directory = ${WORKDIR}/logs/example_${example_id}" >> ${HOME}/.nipype/nipype.cfg
 
+echo '[execution]' >> ${HOME}/.nipype/nipype.cfg
+echo 'crashfile_format = txt' >> ${HOME}/.nipype/nipype.cfg
+
+if [[ "${NIPYPE_RESOURCE_MONITOR:-0}" == "1" ]]; then
+  echo 'resource_monitor = true' >> ${HOME}/.nipype/nipype.cfg
+  echo 'resource_monitor_frequency = 3' >> ${HOME}/.nipype/nipype.cfg
+fi
+
 # Set up coverage
 export COVERAGE_FILE=${WORKDIR}/tests/.coverage.${example_id}
 if [ "$2" == "MultiProc" ]; then
@@ -25,8 +33,10 @@ fi
 
 coverage run /src/nipype/tools/run_examples.py $@
 exit_code=$?
+if [[ "${NIPYPE_RESOURCE_MONITOR:-0}" == "1" ]]; then + cp resource_monitor.json 2>/dev/null ${WORKDIR}/logs/example_${example_id}/ || : +fi # Collect crashfiles and generate xml report coverage xml -o ${WORKDIR}/tests/smoketest_${example_id}.xml -find /work -name "crash-*" -maxdepth 1 -exec mv {} ${WORKDIR}/crashfiles/ \; +find /work -maxdepth 1 -name "crash-*" -exec mv {} ${WORKDIR}/crashfiles/ \; exit $exit_code - diff --git a/docker/files/run_pytests.sh b/docker/files/run_pytests.sh index f76734ad45..19b6fcab87 100644 --- a/docker/files/run_pytests.sh +++ b/docker/files/run_pytests.sh @@ -17,10 +17,11 @@ echo '[logging]' > ${HOME}/.nipype/nipype.cfg echo 'log_to_file = true' >> ${HOME}/.nipype/nipype.cfg echo "log_directory = ${WORKDIR}/logs/py${PYTHON_VERSION}" >> ${HOME}/.nipype/nipype.cfg -# Enable profile_runtime tests only for python 2.7 -if [[ "${PYTHON_VERSION}" -lt "30" ]]; then - echo '[execution]' >> ${HOME}/.nipype/nipype.cfg - echo 'profile_runtime = true' >> ${HOME}/.nipype/nipype.cfg +echo '[execution]' >> ${HOME}/.nipype/nipype.cfg +echo 'crashfile_format = txt' >> ${HOME}/.nipype/nipype.cfg + +if [[ "${NIPYPE_RESOURCE_MONITOR:-0}" == "1" ]]; then + echo 'resource_monitor = true' >> ${HOME}/.nipype/nipype.cfg fi # Run tests using pytest @@ -28,17 +29,8 @@ export COVERAGE_FILE=${WORKDIR}/tests/.coverage.py${PYTHON_VERSION} py.test -v --junitxml=${WORKDIR}/tests/pytests_py${PYTHON_VERSION}.xml --cov nipype --cov-config /src/nipype/.coveragerc --cov-report xml:${WORKDIR}/tests/coverage_py${PYTHON_VERSION}.xml ${TESTPATH} exit_code=$? -# Workaround: run here the profiler tests in python 3 -if [[ "${PYTHON_VERSION}" -ge "30" ]]; then - echo '[execution]' >> ${HOME}/.nipype/nipype.cfg - echo 'profile_runtime = true' >> ${HOME}/.nipype/nipype.cfg - export COVERAGE_FILE=${WORKDIR}/tests/.coverage.py${PYTHON_VERSION}_extra - py.test -v --junitxml=${WORKDIR}/tests/pytests_py${PYTHON_VERSION}_extra.xml --cov nipype --cov-report xml:${WORKDIR}/tests/coverage_py${PYTHON_VERSION}_extra.xml /src/nipype/nipype/interfaces/tests/test_runtime_profiler.py /src/nipype/nipype/pipeline/plugins/tests/test_multiproc*.py - exit_code=$(( $exit_code + $? 
)) -fi - # Collect crashfiles -find ${WORKDIR} -name "crash-*" -maxdepth 1 -exec mv {} ${WORKDIR}/crashfiles/ \; +find ${WORKDIR} -maxdepth 1 -name "crash-*" -exec mv {} ${WORKDIR}/crashfiles/ \; echo "Unit tests finished with exit code ${exit_code}" exit ${exit_code} diff --git a/nipype/info.py b/nipype/info.py index b6a6ac1d3e..d7b1b106d1 100644 --- a/nipype/info.py +++ b/nipype/info.py @@ -159,7 +159,7 @@ def get_nipype_gitversion(): 'doc': ['Sphinx>=1.4', 'matplotlib', 'pydotplus', 'pydot>=1.2.3'], 'tests': TESTS_REQUIRES, 'nipy': ['nitime', 'nilearn', 'dipy', 'nipy', 'matplotlib'], - 'profiler': ['psutil'], + 'profiler': ['psutil>=5.0'], 'duecredit': ['duecredit'], 'xvfbwrapper': ['xvfbwrapper'], 'pybids' : ['pybids'] diff --git a/nipype/interfaces/afni/base.py b/nipype/interfaces/afni/base.py index 3ab5756f03..3405f96cfa 100644 --- a/nipype/interfaces/afni/base.py +++ b/nipype/interfaces/afni/base.py @@ -45,6 +45,7 @@ def version(): """ try: clout = CommandLine(command='afni --version', + resource_monitor=False, terminal_output='allatonce').run() except IOError: # If afni_vcheck is not present, return None @@ -106,6 +107,8 @@ def standard_image(img_name): Could be made more fancy to allow for more relocatability''' clout = CommandLine('which afni', + ignore_exception=True, + resource_monitor=False, terminal_output='allatonce').run() if clout.runtime.returncode is not 0: return None @@ -127,6 +130,8 @@ def _run_interface(self, runtime): class AFNICommandInputSpec(CommandLineInputSpec): + num_threads = traits.Int(1, usedefault=True, nohash=True, + desc='set number of threads') outputtype = traits.Enum('AFNI', list(Info.ftypes.keys()), desc='AFNI output filetype') out_file = File(name_template="%s_afni", desc='output image file name', @@ -138,6 +143,7 @@ class AFNICommandOutputSpec(TraitedSpec): out_file = File(desc='output file', exists=True) + class AFNICommand(AFNICommandBase): """Shared options for several AFNI commands """ input_spec = AFNICommandInputSpec @@ -169,10 +175,36 @@ class AFNICommand(AFNICommandBase): 'tags': ['implementation'], }] + @property + def num_threads(self): + return self.inputs.num_threads + + @num_threads.setter + def num_threads(self, value): + self.inputs.num_threads = value + + @classmethod + def set_default_output_type(cls, outputtype): + """Set the default output type for AFNI classes. + + This method is used to set the default output type for all afni + subclasses. However, setting this will not update the output + type for any existing instances. For these, assign the + .inputs.outputtype. 
+ """ + + if outputtype in Info.ftypes: + cls._outputtype = outputtype + else: + raise AttributeError('Invalid AFNI outputtype: %s' % outputtype) + def __init__(self, **inputs): super(AFNICommand, self).__init__(**inputs) self.inputs.on_trait_change(self._output_update, 'outputtype') + if hasattr(self.inputs, 'num_threads'): + self.inputs.on_trait_change(self._nthreads_update, 'num_threads') + if self._outputtype is None: self._outputtype = Info.outputtype() @@ -181,11 +213,9 @@ def __init__(self, **inputs): else: self._output_update() - def _run_interface(self, runtime): - # Update num threads estimate from OMP_NUM_THREADS env var - # Default to 1 if not set - self.inputs.environ['OMP_NUM_THREADS'] = str(self.num_threads) - return super(AFNICommand, self)._run_interface(runtime) + def _nthreads_update(self): + """Update environment with new number of threads""" + self.inputs.environ['OMP_NUM_THREADS'] = '%d' % self.inputs.num_threads def _output_update(self): """ i think? updates class private attribute based on instance input @@ -194,21 +224,6 @@ def _output_update(self): """ self._outputtype = self.inputs.outputtype - @classmethod - def set_default_output_type(cls, outputtype): - """Set the default output type for AFNI classes. - - This method is used to set the default output type for all afni - subclasses. However, setting this will not update the output - type for any existing instances. For these, assign the - .inputs.outputtype. - """ - - if outputtype in Info.ftypes: - cls._outputtype = outputtype - else: - raise AttributeError('Invalid AFNI outputtype: %s' % outputtype) - def _overload_extension(self, value, name=None): path, base, _ = split_filename(value) return os.path.join(path, base + Info.output_type_to_ext(self.inputs.outputtype)) @@ -271,6 +286,7 @@ def _gen_fname(self, basename, cwd=None, suffix=None, change_ext=True, use_ext=False, newpath=cwd) return fname + def no_afni(): """ Checks if AFNI is available """ if Info.version() is None: @@ -282,8 +298,9 @@ class AFNIPythonCommandInputSpec(CommandLineInputSpec): outputtype = traits.Enum('AFNI', list(Info.ftypes.keys()), desc='AFNI output filetype') py27_path = traits.Either('python2', File(exists=True), - usedefault=True, - default='python2') + usedefault=True, + default='python2') + class AFNIPythonCommand(AFNICommand): @property @@ -295,4 +312,4 @@ def cmd(self): @property def cmdline(self): - return "{} {}".format(self.inputs.py27_path, super(AFNIPythonCommand, self).cmdline) \ No newline at end of file + return "{} {}".format(self.inputs.py27_path, super(AFNIPythonCommand, self).cmdline) diff --git a/nipype/interfaces/base.py b/nipype/interfaces/base.py index 19cf9ccaa6..93ea49eeaa 100644 --- a/nipype/interfaces/base.py +++ b/nipype/interfaces/base.py @@ -24,7 +24,7 @@ import platform from string import Template import select -import subprocess +import subprocess as sp import sys import time from textwrap import wrap @@ -44,30 +44,22 @@ File, Directory, DictStrStr, has_metadata, ImageFile) from ..external.due import due -runtime_profile = str2bool(config.get('execution', 'profile_runtime')) nipype_version = Version(__version__) iflogger = logging.getLogger('interface') FLOAT_FORMAT = '{:.10f}'.format PY35 = sys.version_info >= (3, 5) PY3 = sys.version_info[0] > 2 - -if runtime_profile: - try: - import psutil - except ImportError as exc: - iflogger.info('Unable to import packages needed for runtime profiling. '\ - 'Turning off runtime profiler. 
Reason: %s' % exc) - runtime_profile = False - __docformat__ = 'restructuredtext' class Str(traits.Unicode): pass + traits.Str = Str + class NipypeInterfaceError(Exception): def __init__(self, value): self.value = value @@ -75,6 +67,7 @@ def __init__(self, value): def __str__(self): return '{}'.format(self.value) + def _exists_in_path(cmd, environ): """ Based on a code snippet from @@ -108,7 +101,6 @@ def load_template(name): template : string.Template """ - full_fname = os.path.join(os.path.dirname(__file__), 'script_templates', name) template_file = open(full_fname) @@ -144,7 +136,6 @@ class Bunch(object): """ - def __init__(self, *args, **kwargs): self.__dict__.update(*args, **kwargs) @@ -581,7 +572,6 @@ def get_hashval(self, hash_method=None): hash_files=hash_files))) return dict_withhash, md5(to_str(dict_nofilename).encode()).hexdigest() - def _get_sorteddict(self, objekt, dictwithhash=False, hash_method=None, hash_files=True): if isinstance(objekt, dict): @@ -738,9 +728,9 @@ def _get_filecopy_info(self): class BaseInterfaceInputSpec(TraitedSpec): - ignore_exception = traits.Bool(False, desc="Print an error message instead \ -of throwing an exception in case the interface fails to run", usedefault=True, - nohash=True) + ignore_exception = traits.Bool(False, usedefault=True, nohash=True, + desc='Print an error message instead of throwing an exception ' + 'in case the interface fails to run') class BaseInterface(Interface): @@ -760,21 +750,36 @@ class BaseInterface(Interface): This class cannot be instantiated. + + Relevant Interface attributes + ----------------------------- + + ``input_spec`` points to the traited class for the inputs + ``output_spec`` points to the traited class for the outputs + ``_redirect_x`` should be set to ``True`` when the interface requires + connecting to a ``$DISPLAY`` (default is ``False``). + ``resource_monitor`` if ``False`` prevents resource-monitoring this + interface, if ``True`` monitoring will be enabled IFF the general + Nipype config is set on (``resource_monitor = true``). 
+ + """ input_spec = BaseInterfaceInputSpec _version = None _additional_metadata = [] _redirect_x = False references_ = [] + resource_monitor = True # Enabled for this interface IFF enabled in the config - def __init__(self, from_file=None, **inputs): + def __init__(self, from_file=None, resource_monitor=None, **inputs): if not self.input_spec: raise Exception('No input_spec in class: %s' % self.__class__.__name__) self.inputs = self.input_spec(**inputs) - self.estimated_memory_gb = 0.25 - self.num_threads = 1 + + if resource_monitor is not None: + self.resource_monitor = resource_monitor if from_file is not None: self.load_inputs_from_json(from_file, overwrite=True) @@ -782,7 +787,6 @@ def __init__(self, from_file=None, **inputs): for name, value in list(inputs.items()): setattr(self.inputs, name, value) - @classmethod def help(cls, returnhelp=False): """ Prints class help @@ -903,7 +907,7 @@ def _outputs_help(cls): """ helpstr = ['Outputs::', ''] if cls.output_spec: - outputs = cls.output_spec() #pylint: disable=E1102 + outputs = cls.output_spec() # pylint: disable=E1102 for name, spec in sorted(outputs.traits(transient=None).items()): helpstr += cls._get_trait_desc(outputs, name, spec) if len(helpstr) == 2: @@ -915,7 +919,7 @@ def _outputs(self): """ outputs = None if self.output_spec: - outputs = self.output_spec() #pylint: disable=E1102 + outputs = self.output_spec() # pylint: disable=E1102 return outputs @@ -1011,7 +1015,6 @@ def _check_version_requirements(self, trait_object, raise_exception=True): return unavailable_traits def _run_wrapper(self, runtime): - sysdisplay = os.getenv('DISPLAY') if self._redirect_x: try: from xvfbwrapper import Xvfb @@ -1063,6 +1066,10 @@ def run(self, **inputs): results : an InterfaceResult object containing a copy of the instance that was executed, provenance information and, if successful, results """ + from ..utils.profiler import ResourceMonitor + + enable_rm = config.resource_monitor and self.resource_monitor + force_raise = not getattr(self.inputs, 'ignore_exception', False) self.inputs.trait_set(**inputs) self._check_mandatory_inputs() self._check_version_requirements(self.inputs) @@ -1070,6 +1077,8 @@ def run(self, **inputs): self._duecredit_cite() # initialize provenance tracking + store_provenance = str2bool(config.get( + 'execution', 'write_provenance', 'false')) env = deepcopy(dict(os.environ)) runtime = Bunch(cwd=os.getcwd(), returncode=None, @@ -1080,66 +1089,72 @@ def run(self, **inputs): platform=platform.platform(), hostname=platform.node(), version=self.version) + + mon_sp = None + if enable_rm: + mon_freq = float(config.get('execution', 'resource_monitor_frequency', 1)) + proc_pid = os.getpid() + iflogger.debug('Creating a ResourceMonitor on a %s interface, PID=%d.', + self.__class__.__name__, proc_pid) + mon_sp = ResourceMonitor(proc_pid, freq=mon_freq) + mon_sp.start() + + # Grab inputs now, as they should not change during execution + inputs = self.inputs.get_traitsfree() + outputs = None try: runtime = self._run_wrapper(runtime) outputs = self.aggregate_outputs(runtime) - runtime.endTime = dt.isoformat(dt.utcnow()) - timediff = parseutc(runtime.endTime) - parseutc(runtime.startTime) - runtime.duration = (timediff.days * 86400 + timediff.seconds + - timediff.microseconds / 100000.) 
- results = InterfaceResult(interface, runtime, - inputs=self.inputs.get_traitsfree(), - outputs=outputs) - prov_record = None - if str2bool(config.get('execution', 'write_provenance')): - prov_record = write_provenance(results) - results.provenance = prov_record except Exception as e: + import traceback + # Retrieve the maximum info fast + runtime.traceback = traceback.format_exc() + # Gather up the exception arguments and append nipype info. + exc_args = e.args if getattr(e, 'args') else tuple() + exc_args += ('An exception of type %s occurred while running interface %s.' % + (type(e).__name__, self.__class__.__name__), ) + if config.get('logging', 'interface_level', 'info').lower() == 'debug': + exc_args += ('Inputs: %s' % str(self.inputs),) + + runtime.traceback_args = ('\n'.join(['%s' % arg for arg in exc_args]),) + + if force_raise: + raise + finally: + # This needs to be done always runtime.endTime = dt.isoformat(dt.utcnow()) timediff = parseutc(runtime.endTime) - parseutc(runtime.startTime) runtime.duration = (timediff.days * 86400 + timediff.seconds + - timediff.microseconds / 100000.) - if len(e.args) == 0: - e.args = ("") - - message = "\nInterface %s failed to run." % self.__class__.__name__ - - if config.has_option('logging', 'interface_level') and \ - config.get('logging', 'interface_level').lower() == 'debug': - inputs_str = "\nInputs:" + str(self.inputs) + "\n" - else: - inputs_str = '' - - if len(e.args) == 1 and isinstance(e.args[0], (str, bytes)): - e.args = (e.args[0] + " ".join([message, inputs_str]),) - else: - e.args += (message, ) - if inputs_str != '': - e.args += (inputs_str, ) + timediff.microseconds / 1e6) + results = InterfaceResult(interface, runtime, inputs=inputs, outputs=outputs, + provenance=None) + + # Add provenance (if required) + if store_provenance: + # Provenance will only throw a warning if something went wrong + results.provenance = write_provenance(results) + + # Make sure runtime profiler is shut down + if enable_rm: + import numpy as np + mon_sp.stop() + + runtime.mem_peak_gb = None + runtime.cpu_percent = None + + # Read .prof file in and set runtime values + vals = np.loadtxt(mon_sp.fname, delimiter=',') + if vals.size: + vals = np.atleast_2d(vals) + runtime.mem_peak_gb = vals[:, 1].max() / 1024 + runtime.cpu_percent = vals[:, 2].max() + + runtime.prof_dict = { + 'time': vals[:, 0].tolist(), + 'mem_gb': (vals[:, 1] / 1024).tolist(), + 'cpus': vals[:, 2].tolist(), + } - # exception raising inhibition for special cases - import traceback - runtime.traceback = traceback.format_exc() - runtime.traceback_args = e.args - inputs = None - try: - inputs = self.inputs.get_traitsfree() - except Exception as e: - pass - results = InterfaceResult(interface, runtime, inputs=inputs) - prov_record = None - if str2bool(config.get('execution', 'write_provenance')): - try: - prov_record = write_provenance(results) - except Exception: - prov_record = None - results.provenance = prov_record - if hasattr(self.inputs, 'ignore_exception') and \ - isdefined(self.inputs.ignore_exception) and \ - self.inputs.ignore_exception: - pass - else: - raise return results def _list_outputs(self): @@ -1153,6 +1168,7 @@ def _list_outputs(self): def aggregate_outputs(self, runtime=None, needed_outputs=None): """ Collate expected outputs and check for existence """ + predicted_outputs = self._list_outputs() outputs = self._outputs() if predicted_outputs: @@ -1170,15 +1186,13 @@ def aggregate_outputs(self, runtime=None, needed_outputs=None): self.__class__.__name__)) try: 
setattr(outputs, key, val) - _ = getattr(outputs, key) except TraitError as error: - if hasattr(error, 'info') and \ - error.info.startswith("an existing"): + if getattr(error, 'info', 'default').startswith('an existing'): msg = ("File/Directory '%s' not found for %s output " "'%s'." % (val, self.__class__.__name__, key)) raise FileNotFoundError(msg) - else: - raise error + raise error + return outputs @property @@ -1270,118 +1284,6 @@ def _read(self, drain): self._lastidx = len(self._rows) -# Get number of threads for process -def _get_num_threads(proc): - """Function to get the number of threads a process is using - NOTE: If - - Parameters - ---------- - proc : psutil.Process instance - the process to evaluate thead usage of - - Returns - ------- - num_threads : int - the number of threads that the process is using - """ - - # Import packages - import psutil - - # If process is running - if proc.status() == psutil.STATUS_RUNNING: - num_threads = proc.num_threads() - elif proc.num_threads() > 1: - tprocs = [psutil.Process(thr.id) for thr in proc.threads()] - alive_tprocs = [tproc for tproc in tprocs if tproc.status() == psutil.STATUS_RUNNING] - num_threads = len(alive_tprocs) - else: - num_threads = 1 - - # Try-block for errors - try: - child_threads = 0 - # Iterate through child processes and get number of their threads - for child in proc.children(recursive=True): - # Leaf process - if len(child.children()) == 0: - # If process is running, get its number of threads - if child.status() == psutil.STATUS_RUNNING: - child_thr = child.num_threads() - # If its not necessarily running, but still multi-threaded - elif child.num_threads() > 1: - # Cast each thread as a process and check for only running - tprocs = [psutil.Process(thr.id) for thr in child.threads()] - alive_tprocs = [tproc for tproc in tprocs if tproc.status() == psutil.STATUS_RUNNING] - child_thr = len(alive_tprocs) - # Otherwise, no threads are running - else: - child_thr = 0 - # Increment child threads - child_threads += child_thr - # Catch any NoSuchProcess errors - except psutil.NoSuchProcess: - pass - - # Number of threads is max between found active children and parent - num_threads = max(child_threads, num_threads) - - # Return number of threads found - return num_threads - - -# Get ram usage of process -def _get_ram_mb(pid, pyfunc=False): - """Function to get the RAM usage of a process and its children - - Parameters - ---------- - pid : integer - the PID of the process to get RAM usage of - pyfunc : boolean (optional); default=False - a flag to indicate if the process is a python function; - when Pythons are multithreaded via multiprocess or threading, - children functions include their own memory + parents. 
if this - is set, the parent memory will removed from children memories - - Reference: http://ftp.dev411.com/t/python/python-list/095thexx8g/multiprocessing-forking-memory-usage - - Returns - ------- - mem_mb : float - the memory RAM in MB utilized by the process PID - """ - - # Import packages - import psutil - - # Init variables - _MB = 1024.0**2 - - # Try block to protect against any dying processes in the interim - try: - # Init parent - parent = psutil.Process(pid) - # Get memory of parent - parent_mem = parent.memory_info().rss - mem_mb = parent_mem/_MB - - # Iterate through child processes - for child in parent.children(recursive=True): - child_mem = child.memory_info().rss - if pyfunc: - child_mem -= parent_mem - mem_mb += child_mem/_MB - - # Catch if process dies, return gracefully - except psutil.NoSuchProcess: - pass - - # Return memory - return mem_mb - - def _canonicalize_env(env): """Windows requires that environment be dicts with bytes as keys and values This function converts any unicode entries for Windows only, returning the @@ -1411,52 +1313,13 @@ def _canonicalize_env(env): return out_env -# Get max resources used for process -def get_max_resources_used(pid, mem_mb, num_threads, pyfunc=False): - """Function to get the RAM and threads usage of a process - - Parameters - ---------- - pid : integer - the process ID of process to profile - mem_mb : float - the high memory watermark so far during process execution (in MB) - num_threads: int - the high thread watermark so far during process execution - - Returns - ------- - mem_mb : float - the new high memory watermark of process (MB) - num_threads : float - the new high thread watermark of process - """ - - # Import packages - import psutil - - try: - mem_mb = max(mem_mb, _get_ram_mb(pid, pyfunc=pyfunc)) - num_threads = max(num_threads, _get_num_threads(psutil.Process(pid))) - except Exception as exc: - iflogger.info('Could not get resources used by process. Error: %s'\ - % exc) - - # Return resources - return mem_mb, num_threads - - def run_command(runtime, output=None, timeout=0.01, redirect_x=False): """Run a command, read stdout and stderr, prefix with timestamp. 
The returned runtime contains a merged stdout+stderr log with timestamps """ - # Init logger - logger = logging.getLogger('workflow') - # Init variables - PIPE = subprocess.PIPE cmdline = runtime.cmdline if redirect_x: @@ -1476,28 +1339,23 @@ def run_command(runtime, output=None, timeout=0.01, redirect_x=False): stderr = open(errfile, 'wb') # t=='text'===default stdout = open(outfile, 'wb') - proc = subprocess.Popen(cmdline, - stdout=stdout, - stderr=stderr, - shell=True, - cwd=runtime.cwd, - env=env) + proc = sp.Popen(cmdline, + stdout=stdout, + stderr=stderr, + shell=True, + cwd=runtime.cwd, + env=env) else: - proc = subprocess.Popen(cmdline, - stdout=PIPE, - stderr=PIPE, - shell=True, - cwd=runtime.cwd, - env=env) + proc = sp.Popen(cmdline, + stdout=sp.PIPE, + stderr=sp.PIPE, + shell=True, + cwd=runtime.cwd, + env=env) result = {} errfile = os.path.join(runtime.cwd, 'stderr.nipype') outfile = os.path.join(runtime.cwd, 'stdout.nipype') - # Init variables for memory profiling - mem_mb = 0 - num_threads = 1 - interval = .5 - if output == 'stream': streams = [Stream('stdout', proc.stdout), Stream('stderr', proc.stderr)] @@ -1513,13 +1371,11 @@ def _process(drain=0): else: for stream in res[0]: stream.read(drain) + while proc.returncode is None: - if runtime_profile: - mem_mb, num_threads = \ - get_max_resources_used(proc.pid, mem_mb, num_threads) proc.poll() _process() - time.sleep(interval) + _process(drain=1) # collect results, merge and return @@ -1533,12 +1389,6 @@ def _process(drain=0): result['merged'] = [r[1] for r in temp] if output == 'allatonce': - if runtime_profile: - while proc.returncode is None: - mem_mb, num_threads = \ - get_max_resources_used(proc.pid, mem_mb, num_threads) - proc.poll() - time.sleep(interval) stdout, stderr = proc.communicate() stdout = stdout.decode(default_encoding) stderr = stderr.decode(default_encoding) @@ -1546,32 +1396,20 @@ def _process(drain=0): result['stderr'] = stderr.split('\n') result['merged'] = '' if output == 'file': - if runtime_profile: - while proc.returncode is None: - mem_mb, num_threads = \ - get_max_resources_used(proc.pid, mem_mb, num_threads) - proc.poll() - time.sleep(interval) - ret_code = proc.wait() + proc.wait() stderr.flush() stdout.flush() - result['stdout'] = [line.decode(default_encoding).strip() for line in open(outfile, 'rb').readlines()] - result['stderr'] = [line.decode(default_encoding).strip() for line in open(errfile, 'rb').readlines()] + result['stdout'] = [line.decode(default_encoding).strip() + for line in open(outfile, 'rb').readlines()] + result['stderr'] = [line.decode(default_encoding).strip() + for line in open(errfile, 'rb').readlines()] result['merged'] = '' if output == 'none': - if runtime_profile: - while proc.returncode is None: - mem_mb, num_threads = \ - get_max_resources_used(proc.pid, mem_mb, num_threads) - proc.poll() - time.sleep(interval) proc.communicate() result['stdout'] = [] result['stderr'] = [] result['merged'] = '' - setattr(runtime, 'runtime_memory_gb', mem_mb/1024.0) - setattr(runtime, 'runtime_threads', num_threads) runtime.stderr = '\n'.join(result['stderr']) runtime.stdout = '\n'.join(result['stdout']) runtime.merged = result['merged'] @@ -1585,19 +1423,18 @@ def get_dependencies(name, environ): Uses otool on darwin, ldd on linux. Currently doesn't support windows. 
""" - PIPE = subprocess.PIPE if sys.platform == 'darwin': - proc = subprocess.Popen('otool -L `which %s`' % name, - stdout=PIPE, - stderr=PIPE, - shell=True, - env=environ) + proc = sp.Popen('otool -L `which %s`' % name, + stdout=sp.PIPE, + stderr=sp.PIPE, + shell=True, + env=environ) elif 'linux' in sys.platform: - proc = subprocess.Popen('ldd `which %s`' % name, - stdout=PIPE, - stderr=PIPE, - shell=True, - env=environ) + proc = sp.Popen('ldd `which %s`' % name, + stdout=sp.PIPE, + stderr=sp.PIPE, + shell=True, + env=environ) else: return 'Platform %s not supported' % sys.platform o, e = proc.communicate() @@ -1607,7 +1444,7 @@ def get_dependencies(name, environ): class CommandLineInputSpec(BaseInterfaceInputSpec): args = Str(argstr='%s', desc='Additional parameters to the command') environ = DictStrStr(desc='Environment variables', usedefault=True, - nohash=True) + nohash=True) # This input does not have a "usedefault=True" so the set_default_terminal_output() # method would work terminal_output = traits.Enum('stream', 'allatonce', 'file', 'none', @@ -1745,12 +1582,12 @@ def version_from_command(self, flag='-v'): if _exists_in_path(cmdname, env): out_environ = self._get_environ() env.update(out_environ) - proc = subprocess.Popen(' '.join((cmdname, flag)), - shell=True, - env=env, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) + proc = sp.Popen(' '.join((cmdname, flag)), + shell=True, + env=env, + stdout=sp.PIPE, + stderr=sp.PIPE, + ) o, e = proc.communicate() return o @@ -1905,7 +1742,7 @@ def _list_outputs(self): metadata = dict(name_source=lambda t: t is not None) traits = self.inputs.traits(**metadata) if traits: - outputs = self.output_spec().get() #pylint: disable=E1102 + outputs = self.output_spec().get() # pylint: disable=E1102 for name, trait_spec in list(traits.items()): out_name = name if trait_spec.output_name is not None: @@ -1953,8 +1790,8 @@ def _parse_inputs(self, skip=None): final_args[pos] = arg else: all_args.append(arg) - first_args = [arg for pos, arg in sorted(initial_args.items())] - last_args = [arg for pos, arg in sorted(final_args.items())] + first_args = [el for _, el in sorted(initial_args.items())] + last_args = [el for _, el in sorted(final_args.items())] return first_args + all_args + last_args @@ -2025,7 +1862,7 @@ class SEMLikeCommandLine(CommandLine): """ def _list_outputs(self): - outputs = self.output_spec().get() #pylint: disable=E1102 + outputs = self.output_spec().get() # pylint: disable=E1102 return self._outputs_from_inputs(outputs) def _outputs_from_inputs(self, outputs): @@ -2062,7 +1899,7 @@ def validate(self, object, name, value): # want to treat range and other sequences (except str) as list if not isinstance(value, (str, bytes)) and isinstance(value, collections.Sequence): - value = list(value) + value = list(value) if not isdefined(value) or \ (isinstance(value, list) and len(value) == 0): diff --git a/nipype/interfaces/dynamic_slicer.py b/nipype/interfaces/dynamic_slicer.py index d38f4171f3..4d1df1e136 100644 --- a/nipype/interfaces/dynamic_slicer.py +++ b/nipype/interfaces/dynamic_slicer.py @@ -25,7 +25,8 @@ class SlicerCommandLine(CommandLine): output_spec = DynamicTraitedSpec def _grab_xml(self, module): - cmd = CommandLine(command="Slicer3", args="--launch %s --xml" % module) + cmd = CommandLine(command="Slicer3", resource_monitor=False, + args="--launch %s --xml" % module) ret = cmd.run() if ret.runtime.returncode == 0: return xml.dom.minidom.parseString(ret.runtime.stdout) diff --git 
a/nipype/interfaces/fsl/tests/test_preprocess.py b/nipype/interfaces/fsl/tests/test_preprocess.py index 7d3d6a9dce..32f0266ddb 100644 --- a/nipype/interfaces/fsl/tests/test_preprocess.py +++ b/nipype/interfaces/fsl/tests/test_preprocess.py @@ -258,7 +258,8 @@ def test_flirt(setup_flirt): if key in ('trait_added', 'trait_modified', 'in_file', 'reference', 'environ', 'output_type', 'out_file', 'out_matrix_file', 'in_matrix_file', 'apply_xfm', 'ignore_exception', - 'terminal_output', 'out_log', 'save_log'): + 'resource_monitor', 'terminal_output', 'out_log', + 'save_log'): continue param = None value = None diff --git a/nipype/interfaces/matlab.py b/nipype/interfaces/matlab.py index d3f6f26993..b56ef3ce17 100644 --- a/nipype/interfaces/matlab.py +++ b/nipype/interfaces/matlab.py @@ -22,6 +22,7 @@ def get_matlab_command(): try: res = CommandLine(command='which', args=matlab_cmd, + resource_monitor=False, terminal_output='allatonce').run() matlab_path = res.runtime.stdout.strip() except Exception as e: diff --git a/nipype/interfaces/spm/base.py b/nipype/interfaces/spm/base.py index 6c3fbab32e..391528e83b 100644 --- a/nipype/interfaces/spm/base.py +++ b/nipype/interfaces/spm/base.py @@ -151,19 +151,14 @@ def version(matlab_cmd=None, paths=None, use_mcr=None): returns None of path not found """ - if use_mcr or 'FORCE_SPMMCR' in os.environ: - use_mcr = True - if matlab_cmd is None: - try: - matlab_cmd = os.environ['SPMMCRCMD'] - except KeyError: - pass - if matlab_cmd is None: - try: - matlab_cmd = os.environ['MATLABCMD'] - except KeyError: - matlab_cmd = 'matlab -nodesktop -nosplash' - mlab = MatlabCommand(matlab_cmd=matlab_cmd) + + use_mcr = use_mcr or 'FORCE_SPMMCR' in os.environ + matlab_cmd = ((use_mcr and os.getenv('SPMMCRCMD')) or + os.getenv('MATLABCMD') or + 'matlab -nodesktop -nosplash') + + mlab = MatlabCommand(matlab_cmd=matlab_cmd, + resource_monitor=False) mlab.inputs.mfile = False if paths: mlab.inputs.paths = paths @@ -187,7 +182,7 @@ def version(matlab_cmd=None, paths=None, use_mcr=None): except (IOError, RuntimeError) as e: # if no Matlab at all -- exception could be raised # No Matlab -- no spm - logger.debug(str(e)) + logger.debug('%s', e) return None else: out = sd._strip_header(out.runtime.stdout) @@ -276,11 +271,12 @@ def _find_mlab_cmd_defaults(self): def _matlab_cmd_update(self): # MatlabCommand has to be created here, - # because matlab_cmb is not a proper input + # because matlab_cmd is not a proper input # and can be set only during init self.mlab = MatlabCommand(matlab_cmd=self.inputs.matlab_cmd, mfile=self.inputs.mfile, - paths=self.inputs.paths) + paths=self.inputs.paths, + resource_monitor=False) self.mlab.inputs.script_file = 'pyscript_%s.m' % \ self.__class__.__name__.split('.')[-1].lower() if isdefined(self.inputs.use_mcr) and self.inputs.use_mcr: diff --git a/nipype/interfaces/tests/test_resource_monitor.py b/nipype/interfaces/tests/test_resource_monitor.py new file mode 100644 index 0000000000..660f11455e --- /dev/null +++ b/nipype/interfaces/tests/test_resource_monitor.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- +# vi: set ft=python sts=4 ts=4 sw=4 et: +""" +Module to unit test the resource_monitor in nipype +""" + +from __future__ import print_function, division, unicode_literals, absolute_import +import os +import pytest + +# Import packages +from nipype import config +from nipype.utils.profiler import _use_resources +from nipype.interfaces.base import traits, 
CommandLine, CommandLineInputSpec +from nipype.interfaces import utility as niu + +# Try to enable the resource monitor +config.enable_resource_monitor() +run_profile = config.resource_monitor + + +class UseResourcesInputSpec(CommandLineInputSpec): + mem_gb = traits.Float(desc='Number of GB of RAM to use', + argstr='-g %f', mandatory=True) + n_procs = traits.Int(desc='Number of threads to use', + argstr='-p %d', mandatory=True) + + +class UseResources(CommandLine): + """ + use_resources cmd interface + """ + from nipype import __path__ + # Init attributes + input_spec = UseResourcesInputSpec + + # Get path of executable + exec_dir = os.path.realpath(__path__[0]) + exec_path = os.path.join(exec_dir, 'utils', 'tests', 'use_resources') + + # Init cmd + _cmd = exec_path + _always_run = True + + +@pytest.mark.skipif(os.getenv('CI_SKIP_TEST', False), reason='disabled in CI tests') +@pytest.mark.parametrize("mem_gb,n_procs", [(0.5, 3), (2.2, 8), (0.8, 4), (1.5, 1)]) +def test_cmdline_profiling(tmpdir, mem_gb, n_procs): + """ + Test runtime profiler correctly records workflow RAM/CPUs consumption + of a CommandLine-derived interface + """ + from nipype import config + config.set('execution', 'resource_monitor_frequency', '0.2') # Force sampling fast + + tmpdir.chdir() + iface = UseResources(mem_gb=mem_gb, n_procs=n_procs) + result = iface.run() + + assert abs(mem_gb - result.runtime.mem_peak_gb) < 0.3, 'estimated memory error above .3GB' + assert int(result.runtime.cpu_percent / 100 + 0.2) == n_procs, 'wrong number of threads estimated' + + +@pytest.mark.skipif(True, reason='test disabled temporarily, until funcion profiling works') +@pytest.mark.parametrize("mem_gb,n_procs", [(0.5, 3), (2.2, 8), (0.8, 4), (1.5, 1)]) +def test_function_profiling(tmpdir, mem_gb, n_procs): + """ + Test runtime profiler correctly records workflow RAM/CPUs consumption + of a Function interface + """ + from nipype import config + config.set('execution', 'resource_monitor_frequency', '0.2') # Force sampling fast + + tmpdir.chdir() + iface = niu.Function(function=_use_resources) + iface.inputs.mem_gb = mem_gb + iface.inputs.n_procs = n_procs + result = iface.run() + + assert abs(mem_gb - result.runtime.mem_peak_gb) < 0.3, 'estimated memory error above .3GB' + assert int(result.runtime.cpu_percent / 100 + 0.2) >= n_procs diff --git a/nipype/interfaces/tests/test_runtime_profiler.py b/nipype/interfaces/tests/test_runtime_profiler.py deleted file mode 100644 index 400b2728ae..0000000000 --- a/nipype/interfaces/tests/test_runtime_profiler.py +++ /dev/null @@ -1,448 +0,0 @@ -# -*- coding: utf-8 -*- -# test_runtime_profiler.py -# -# Author: Daniel Clark, 2016 - -""" -Module to unit test the runtime_profiler in nipype -""" - -from __future__ import print_function, division, unicode_literals, absolute_import -from builtins import open, str - -# Import packages -from nipype.interfaces.base import (traits, CommandLine, CommandLineInputSpec, - runtime_profile) -import pytest -import sys - -run_profile = runtime_profile - -if run_profile: - try: - import psutil - skip_profile_msg = 'Run profiler tests' - except ImportError as exc: - skip_profile_msg = 'Missing python packages for runtime profiling, skipping...\n'\ - 'Error: %s' % exc - run_profile = False -else: - skip_profile_msg = 'Not running profiler' - -# UseResources inputspec -class UseResourcesInputSpec(CommandLineInputSpec): - ''' - use_resources cmd interface inputspec - ''' - - # Init attributes - num_gb = traits.Float(desc='Number of GB of RAM to use', - argstr='-g %f') 
- num_threads = traits.Int(desc='Number of threads to use', - argstr='-p %d') - - -# UseResources interface -class UseResources(CommandLine): - ''' - use_resources cmd interface - ''' - - # Import packages - import os - - # Init attributes - input_spec = UseResourcesInputSpec - - # Get path of executable - exec_dir = os.path.dirname(os.path.realpath(__file__)) - exec_path = os.path.join(exec_dir, 'use_resources') - - # Init cmd - _cmd = exec_path - - -# Spin multiple threads -def use_resources(num_threads, num_gb): - ''' - Function to execute multiple use_gb_ram functions in parallel - ''' - - # Function to occupy GB of memory - def _use_gb_ram(num_gb): - ''' - Function to consume GB of memory - ''' - import sys - - # Getsize of one character string - bsize = sys.getsizeof(' ') - sys.getsizeof(' ') - boffset = sys.getsizeof('') - - num_bytes = int(num_gb * (1024**3)) - # Eat num_gb GB of memory for 1 second - gb_str = ' ' * ((num_bytes - boffset) // bsize) - - assert sys.getsizeof(gb_str) == num_bytes - - # Spin CPU - ctr = 0 - while ctr < 30e6: - ctr += 1 - - # Clear memory - del ctr - del gb_str - - # Import packages - from multiprocessing import Process - from threading import Thread - - # Init variables - num_gb = float(num_gb) - - # Build thread list - thread_list = [] - for idx in range(num_threads): - thread = Thread(target=_use_gb_ram, args=(num_gb/num_threads,), - name=str(idx)) - thread_list.append(thread) - - # Run multi-threaded - print('Using %.3f GB of memory over %d sub-threads...' % \ - (num_gb, num_threads)) - for idx, thread in enumerate(thread_list): - thread.start() - - for thread in thread_list: - thread.join() - - -# Test case for the run function -class TestRuntimeProfiler(): - ''' - This class is a test case for the runtime profiler - ''' - - # setup method for the necessary arguments to run cpac_pipeline.run - def setup_class(self): - ''' - Method to instantiate TestRuntimeProfiler - - Parameters - ---------- - self : TestRuntimeProfile - ''' - - # Init parameters - # Input RAM GB to occupy - self.num_gb = 1.0 - # Input number of sub-threads (not including parent threads) - self.num_threads = 2 - # Acceptable percent error for memory profiled against input - self.mem_err_gb = 0.3 # Increased to 30% for py2.7 - - # ! Only used for benchmarking the profiler over a range of - # ! RAM usage and number of threads - # ! 
Requires a LOT of RAM to be tested - def _collect_range_runtime_stats(self, num_threads): - ''' - Function to collect a range of runtime stats - ''' - - # Import packages - import json - import numpy as np - import pandas as pd - - # Init variables - ram_gb_range = 10.0 - ram_gb_step = 0.25 - dict_list = [] - - # Iterate through all combos - for num_gb in np.arange(0.25, ram_gb_range+ram_gb_step, ram_gb_step): - # Cmd-level - cmd_start_str, cmd_fin_str = self._run_cmdline_workflow(num_gb, num_threads) - cmd_start_ts = json.loads(cmd_start_str)['start'] - cmd_node_stats = json.loads(cmd_fin_str) - cmd_runtime_threads = int(cmd_node_stats['runtime_threads']) - cmd_runtime_gb = float(cmd_node_stats['runtime_memory_gb']) - cmd_finish_ts = cmd_node_stats['finish'] - - # Func-level - func_start_str, func_fin_str = self._run_function_workflow(num_gb, num_threads) - func_start_ts = json.loads(func_start_str)['start'] - func_node_stats = json.loads(func_fin_str) - func_runtime_threads = int(func_node_stats['runtime_threads']) - func_runtime_gb = float(func_node_stats['runtime_memory_gb']) - func_finish_ts = func_node_stats['finish'] - - # Calc errors - cmd_threads_err = cmd_runtime_threads - num_threads - cmd_gb_err = cmd_runtime_gb - num_gb - func_threads_err = func_runtime_threads - num_threads - func_gb_err = func_runtime_gb - num_gb - - # Node dictionary - results_dict = {'input_threads' : num_threads, - 'input_gb' : num_gb, - 'cmd_runtime_threads' : cmd_runtime_threads, - 'cmd_runtime_gb' : cmd_runtime_gb, - 'func_runtime_threads' : func_runtime_threads, - 'func_runtime_gb' : func_runtime_gb, - 'cmd_threads_err' : cmd_threads_err, - 'cmd_gb_err' : cmd_gb_err, - 'func_threads_err' : func_threads_err, - 'func_gb_err' : func_gb_err, - 'cmd_start_ts' : cmd_start_ts, - 'cmd_finish_ts' : cmd_finish_ts, - 'func_start_ts' : func_start_ts, - 'func_finish_ts' : func_finish_ts} - # Append to list - dict_list.append(results_dict) - - # Create dataframe - runtime_results_df = pd.DataFrame(dict_list) - - # Return dataframe - return runtime_results_df - - # Test node - def _run_cmdline_workflow(self, num_gb, num_threads): - ''' - Function to run the use_resources cmdline script in a nipype workflow - and return the runtime stats recorded by the profiler - - Parameters - ---------- - self : TestRuntimeProfile - - Returns - ------- - finish_str : string - a json-compatible dictionary string containing the runtime - statistics of the nipype node that used system resources - ''' - - # Import packages - import logging - import os - import shutil - import tempfile - - import nipype.pipeline.engine as pe - import nipype.interfaces.utility as util - from nipype.pipeline.plugins.callback_log import log_nodes_cb - - # Init variables - base_dir = tempfile.mkdtemp() - log_file = os.path.join(base_dir, 'callback.log') - - # Init logger - logger = logging.getLogger('callback') - logger.setLevel(logging.DEBUG) - handler = logging.FileHandler(log_file) - logger.addHandler(handler) - - # Declare workflow - wf = pe.Workflow(name='test_runtime_prof_cmd') - wf.base_dir = base_dir - - # Input node - input_node = pe.Node(util.IdentityInterface(fields=['num_gb', - 'num_threads']), - name='input_node') - input_node.inputs.num_gb = num_gb - input_node.inputs.num_threads = num_threads - - # Resources used node - resource_node = pe.Node(UseResources(), name='resource_node') - resource_node.interface.estimated_memory_gb = num_gb - resource_node.interface.num_threads = num_threads - - # Connect workflow - wf.connect(input_node, 
'num_gb', resource_node, 'num_gb') - wf.connect(input_node, 'num_threads', resource_node, 'num_threads') - - # Run workflow - plugin_args = {'n_procs' : num_threads, - 'memory_gb' : num_gb, - 'status_callback' : log_nodes_cb} - wf.run(plugin='MultiProc', plugin_args=plugin_args) - - # Get runtime stats from log file - with open(log_file, 'r') as log_handle: - lines = log_handle.readlines() - start_str = lines[0].rstrip('\n') - finish_str = lines[1].rstrip('\n') - - # Delete wf base dir - shutil.rmtree(base_dir) - - # Return runtime stats - return start_str, finish_str - - # Test node - def _run_function_workflow(self, num_gb, num_threads): - ''' - Function to run the use_resources() function in a nipype workflow - and return the runtime stats recorded by the profiler - - Parameters - ---------- - self : TestRuntimeProfile - - Returns - ------- - finish_str : string - a json-compatible dictionary string containing the runtime - statistics of the nipype node that used system resources - ''' - - # Import packages - import logging - import os - import shutil - import tempfile - - import nipype.pipeline.engine as pe - import nipype.interfaces.utility as util - from nipype.pipeline.plugins.callback_log import log_nodes_cb - - # Init variables - base_dir = tempfile.mkdtemp() - log_file = os.path.join(base_dir, 'callback.log') - - # Init logger - logger = logging.getLogger('callback') - logger.setLevel(logging.DEBUG) - handler = logging.FileHandler(log_file) - logger.addHandler(handler) - - # Declare workflow - wf = pe.Workflow(name='test_runtime_prof_func') - wf.base_dir = base_dir - - # Input node - input_node = pe.Node(util.IdentityInterface(fields=['num_gb', - 'num_threads']), - name='input_node') - input_node.inputs.num_gb = num_gb - input_node.inputs.num_threads = num_threads - - # Resources used node - resource_node = pe.Node(util.Function(input_names=['num_threads', - 'num_gb'], - output_names=[], - function=use_resources), - name='resource_node') - resource_node.interface.estimated_memory_gb = num_gb - resource_node.interface.num_threads = num_threads - - # Connect workflow - wf.connect(input_node, 'num_gb', resource_node, 'num_gb') - wf.connect(input_node, 'num_threads', resource_node, 'num_threads') - - # Run workflow - plugin_args = {'n_procs' : num_threads, - 'memory_gb' : num_gb, - 'status_callback' : log_nodes_cb} - wf.run(plugin='MultiProc', plugin_args=plugin_args) - - # Get runtime stats from log file - with open(log_file, 'r') as log_handle: - lines = log_handle.readlines() - start_str = lines[0].rstrip('\n') - finish_str = lines[1].rstrip('\n') - - # Delete wf base dir - shutil.rmtree(base_dir) - - # Return runtime stats - return start_str, finish_str - - # Test resources were used as expected in cmdline interface - @pytest.mark.skipif(run_profile == False, reason=skip_profile_msg) - def test_cmdline_profiling(self): - ''' - Test runtime profiler correctly records workflow RAM/CPUs consumption - from a cmdline function - ''' - - # Import packages - import json - import numpy as np - - # Init variables - num_gb = self.num_gb - num_threads = self.num_threads - - # Run workflow and get stats - start_str, finish_str = self._run_cmdline_workflow(num_gb, num_threads) - # Get runtime stats as dictionary - node_stats = json.loads(finish_str) - - # Read out runtime stats - runtime_gb = float(node_stats['runtime_memory_gb']) - runtime_threads = int(node_stats['runtime_threads']) - - # Get margin of error for RAM GB - allowed_gb_err = self.mem_err_gb - runtime_gb_err = 
np.abs(runtime_gb-num_gb) - # - expected_runtime_threads = num_threads - - # Error message formatting - mem_err = 'Input memory: %f is not within %.3f GB of runtime '\ - 'memory: %f' % (num_gb, self.mem_err_gb, runtime_gb) - threads_err = 'Input threads: %d is not equal to runtime threads: %d' \ - % (expected_runtime_threads, runtime_threads) - - # Assert runtime stats are what was input - assert runtime_gb_err <= allowed_gb_err, mem_err - assert abs(expected_runtime_threads - runtime_threads) <= 1, threads_err - - # Test resources were used as expected - @pytest.mark.skipif(True, reason="https://github.com/nipy/nipype/issues/1663") - @pytest.mark.skipif(run_profile == False, reason=skip_profile_msg) - def test_function_profiling(self): - ''' - Test runtime profiler correctly records workflow RAM/CPUs consumption - from a python function - ''' - - # Import packages - import json - import numpy as np - - # Init variables - num_gb = self.num_gb - num_threads = self.num_threads - - # Run workflow and get stats - start_str, finish_str = self._run_function_workflow(num_gb, num_threads) - # Get runtime stats as dictionary - node_stats = json.loads(finish_str) - - # Read out runtime stats - runtime_gb = float(node_stats['runtime_memory_gb']) - runtime_threads = int(node_stats['runtime_threads']) - - # Get margin of error for RAM GB - allowed_gb_err = self.mem_err_gb - runtime_gb_err = np.abs(runtime_gb-num_gb) - # - expected_runtime_threads = num_threads - - # Error message formatting - mem_err = 'Input memory: %f is not within %.3f GB of runtime '\ - 'memory: %f' % (num_gb, self.mem_err_gb, runtime_gb) - threads_err = 'Input threads: %d is not equal to runtime threads: %d' \ - % (expected_runtime_threads, runtime_threads) - - # Assert runtime stats are what was input - assert runtime_gb_err <= allowed_gb_err, mem_err - assert abs(expected_runtime_threads - runtime_threads) <= 1, threads_err - - diff --git a/nipype/interfaces/tests/use_resources b/nipype/interfaces/tests/use_resources deleted file mode 100755 index 06e2d3e906..0000000000 --- a/nipype/interfaces/tests/use_resources +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python -# -# use_resources - -''' -Python script to use a certain amount of RAM on disk and number of -threads - -Usage: - use_resources -g -p -''' - -# Function to occupy GB of memory -def use_gb_ram(num_gb): - ''' - Function to consume GB of memory - ''' - - # Eat 1 GB of memory for 1 second - gb_str = ' ' * int(num_gb*1024.0**3) - - # Spin CPU - ctr = 0 - while ctr < 30e6: - ctr+= 1 - - # Clear memory - del ctr - del gb_str - - -# Make main executable -if __name__ == '__main__': - - # Import packages - import argparse - from threading import Thread - from multiprocessing import Process - - # Init argparser - parser = argparse.ArgumentParser(description=__doc__) - - # Add arguments - parser.add_argument('-g', '--num_gb', nargs=1, required=True, - help='Number of GB RAM to use, can be float or int') - parser.add_argument('-p', '--num_threads', nargs=1, required=True, - help='Number of threads to run in parallel') - - # Parse args - args = parser.parse_args() - - # Init variables - num_gb = float(args.num_gb[0]) - num_threads = int(args.num_threads[0]) - - # Build thread list - thread_list = [] - for idx in range(num_threads): - thread_list.append(Thread(target=use_gb_ram, args=(num_gb/num_threads,))) - - # Run multi-threaded - print('Using %.3f GB of memory over %d sub-threads...' 
% \ - (num_gb, num_threads)) - for thread in thread_list: - thread.start() - - for thread in thread_list: - thread.join() diff --git a/nipype/interfaces/utility/wrappers.py b/nipype/interfaces/utility/wrappers.py index 4de11d7ea8..4fa10205d2 100644 --- a/nipype/interfaces/utility/wrappers.py +++ b/nipype/interfaces/utility/wrappers.py @@ -20,21 +20,13 @@ from builtins import str, bytes from ... import logging -from ..base import (traits, DynamicTraitedSpec, Undefined, isdefined, runtime_profile, - BaseInterfaceInputSpec, get_max_resources_used) +from ..base import (traits, DynamicTraitedSpec, Undefined, isdefined, + BaseInterfaceInputSpec) from ..io import IOBase, add_traits from ...utils.filemanip import filename_to_list from ...utils.misc import getsource, create_function_from_source logger = logging.getLogger('interface') -if runtime_profile: - try: - import psutil - except ImportError as exc: - logger.info('Unable to import packages needed for runtime profiling. '\ - 'Turning off runtime profiler. Reason: %s' % exc) - runtime_profile = False - class FunctionInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec): function_str = traits.Str(mandatory=True, desc='code for function') @@ -137,22 +129,9 @@ def _add_output_traits(self, base): return base def _run_interface(self, runtime): - # Get workflow logger for runtime profile error reporting - logger = logging.getLogger('workflow') - # Create function handle function_handle = create_function_from_source(self.inputs.function_str, self.imports) - - # Wrapper for running function handle in multiprocessing.Process - # Can catch exceptions and report output via multiprocessing.Queue - def _function_handle_wrapper(queue, **kwargs): - try: - out = function_handle(**kwargs) - queue.put(out) - except Exception as exc: - queue.put(exc) - # Get function args args = {} for name in self._input_names: @@ -160,37 +139,7 @@ def _function_handle_wrapper(queue, **kwargs): if isdefined(value): args[name] = value - # Profile resources if set - if runtime_profile: - import multiprocessing - # Init communication queue and proc objs - queue = multiprocessing.Queue() - proc = multiprocessing.Process(target=_function_handle_wrapper, - args=(queue,), kwargs=args) - - # Init memory and threads before profiling - mem_mb = 0 - num_threads = 0 - - # Start process and profile while it's alive - proc.start() - while proc.is_alive(): - mem_mb, num_threads = \ - get_max_resources_used(proc.pid, mem_mb, num_threads, - pyfunc=True) - - # Get result from process queue - out = queue.get() - # If it is an exception, raise it - if isinstance(out, Exception): - raise out - - # Function ran successfully, populate runtime stats - setattr(runtime, 'runtime_memory_gb', mem_mb / 1024.0) - setattr(runtime, 'runtime_threads', num_threads) - else: - out = function_handle(**args) - + out = function_handle(**args) if len(self._output_names) == 1: self._out[self._output_names[0]] = out else: diff --git a/nipype/pipeline/engine/nodes.py b/nipype/pipeline/engine/nodes.py index 622003f8a2..680639ee8f 100644 --- a/nipype/pipeline/engine/nodes.py +++ b/nipype/pipeline/engine/nodes.py @@ -46,8 +46,7 @@ from ...interfaces.base import (traits, InputMultiPath, CommandLine, Undefined, TraitedSpec, DynamicTraitedSpec, Bunch, InterfaceResult, md5, Interface, - TraitDictObject, TraitListObject, isdefined, - runtime_profile) + TraitDictObject, TraitListObject, isdefined) from .utils import (generate_expanded_graph, modify_paths, export_graph, make_output_dir, write_workflow_prov, 
clean_working_directory, format_dot, topological_sort, @@ -56,6 +55,7 @@ logger = logging.getLogger('workflow') + class Node(EngineBase): """Wraps interface objects for use in pipeline @@ -79,7 +79,7 @@ class Node(EngineBase): def __init__(self, interface, name, iterables=None, itersource=None, synchronize=False, overwrite=None, needed_outputs=None, - run_without_submitting=False, n_procs=1, mem_gb=None, + run_without_submitting=False, n_procs=None, mem_gb=0.20, **kwargs): """ Parameters @@ -153,12 +153,15 @@ def __init__(self, interface, name, iterables=None, itersource=None, if 'base_dir' in kwargs: base_dir = kwargs['base_dir'] super(Node, self).__init__(name, base_dir) + + # Make sure an interface is set, and that it is an Interface if interface is None: raise IOError('Interface must be provided') if not isinstance(interface, Interface): raise IOError('interface must be an instance of an Interface') self._interface = interface self.name = name + self._result = None self.iterables = iterables self.synchronize = synchronize @@ -170,9 +173,10 @@ def __init__(self, interface, name, iterables=None, itersource=None, self.needed_outputs = [] self.plugin_args = {} - self._interface.num_threads = n_procs - if mem_gb is not None: - self._interface.estimated_memory_gb = mem_gb + self._mem_gb = mem_gb + self._n_procs = n_procs + if hasattr(self._interface.inputs, 'num_threads') and self._n_procs is not None: + self._interface.inputs.num_threads = self._n_procs if needed_outputs: self.needed_outputs = sorted(needed_outputs) @@ -202,6 +206,36 @@ def outputs(self): """Return the output fields of the underlying interface""" return self._interface._outputs() + @property + def mem_gb(self): + """Get estimated memory (GB)""" + if hasattr(self._interface, 'estimated_memory_gb'): + self._mem_gb = self._interface.estimated_memory_gb + logger.warning('Setting "estimated_memory_gb" on Interfaces has been ' + 'deprecated as of nipype 1.0, please use Node.mem_gb.') + + return self._mem_gb + + @property + def n_procs(self): + """Get the estimated number of processes/threads""" + if self._n_procs is not None: + return self._n_procs + elif hasattr(self._interface.inputs, 'num_threads') and isdefined( + self._interface.inputs.num_threads): + return self._interface.inputs.num_threads + else: + return 1 + + @n_procs.setter + def n_procs(self, value): + """Set an estimated number of processes/threads""" + self._n_procs = value + + # Overwrite interface's dynamic input of num_threads + if hasattr(self._interface.inputs, 'num_threads'): + self._interface.inputs.num_threads = self._n_procs + def output_dir(self): """Return the location of the output directory for the node""" if self.base_dir is None: @@ -271,6 +305,7 @@ def run(self, updatehash=False): Update the hash stored in the output directory """ # check to see if output directory and hash exist + if self.config is None: self.config = deepcopy(config._sections) else: @@ -279,7 +314,7 @@ def run(self, updatehash=False): self._get_inputs() self._got_inputs = True outdir = self.output_dir() - logger.info("Executing node %s in dir: %s", self._id, outdir) + logger.info("Executing node %s in dir: %s", self.fullname, outdir) if op.exists(outdir): logger.debug('Output dir: %s', to_str(os.listdir(outdir))) hash_info = self.hash_exists(updatehash=updatehash) @@ -595,9 +630,10 @@ def _run_command(self, execute, copyfiles=True): runtime=runtime, inputs=self._interface.inputs.get_traitsfree()) self._result = result - logger.debug('Executing node') if copyfiles: 
self._copyfiles_to_wd(cwd, execute) + + message = 'Running a "%s" interface' if issubclass(self._interface.__class__, CommandLine): try: cmd = self._interface.cmdline @@ -605,10 +641,10 @@ def _run_command(self, execute, copyfiles=True): self._result.runtime.stderr = msg raise cmdfile = op.join(cwd, 'command.txt') - fd = open(cmdfile, 'wt') - fd.writelines(cmd + "\n") - fd.close() - logger.info('Running: %s' % cmd) + with open(cmdfile, 'wt') as fd: + print(cmd + "\n", file=fd) + message += ', a CommandLine Interface with command:\n%s' % cmd + logger.info(message + '.', self._interface.__class__.__name__) try: result = self._interface.run() except Exception as msg: @@ -724,15 +760,13 @@ def write_report(self, report_type=None, cwd=None): return fp.writelines(write_rst_header('Runtime info', level=1)) # Init rst dictionary of runtime stats - rst_dict = {'hostname' : self.result.runtime.hostname, - 'duration' : self.result.runtime.duration} + rst_dict = {'hostname': self.result.runtime.hostname, + 'duration': self.result.runtime.duration} # Try and insert memory/threads usage if available - if runtime_profile: - try: - rst_dict['runtime_memory_gb'] = self.result.runtime.runtime_memory_gb - rst_dict['runtime_threads'] = self.result.runtime.runtime_threads - except AttributeError: - logger.info('Runtime memory and threads stats unavailable') + if config.resource_monitor: + rst_dict['mem_peak_gb'] = self.result.runtime.mem_peak_gb + rst_dict['cpu_percent'] = self.result.runtime.cpu_percent + if hasattr(self.result.runtime, 'cmdline'): rst_dict['command'] = self.result.runtime.cmdline fp.writelines(write_rst_dict(rst_dict)) @@ -1113,8 +1147,8 @@ def _make_nodes(self, cwd=None): for i in range(nitems): nodename = '_' + self.name + str(i) node = Node(deepcopy(self._interface), - n_procs=self._interface.num_threads, - mem_gb=self._interface.estimated_memory_gb, + n_procs=self._n_procs, + mem_gb=self._mem_gb, overwrite=self.overwrite, needed_outputs=self.needed_outputs, run_without_submitting=self.run_without_submitting, diff --git a/nipype/pipeline/engine/tests/test_engine.py b/nipype/pipeline/engine/tests/test_engine.py index 90d566ddf9..6bfffdfbeb 100644 --- a/nipype/pipeline/engine/tests/test_engine.py +++ b/nipype/pipeline/engine/tests/test_engine.py @@ -475,7 +475,7 @@ def double_func(x): def test_mapnode_nested(tmpdir): - os.chdir(str(tmpdir)) + tmpdir.chdir() from nipype import MapNode, Function def func1(in1): @@ -505,7 +505,7 @@ def func1(in1): def test_mapnode_expansion(tmpdir): - os.chdir(str(tmpdir)) + tmpdir.chdir() from nipype import MapNode, Function def func1(in1): @@ -513,23 +513,22 @@ def func1(in1): mapnode = MapNode(Function(function=func1), iterfield='in1', - name='mapnode') + name='mapnode', + n_procs=2, + mem_gb=2) mapnode.inputs.in1 = [1, 2] - mapnode.interface.num_threads = 2 - mapnode.interface.estimated_memory_gb = 2 for idx, node in mapnode._make_nodes(): for attr in ('overwrite', 'run_without_submitting', 'plugin_args'): assert getattr(node, attr) == getattr(mapnode, attr) - for attr in ('num_threads', 'estimated_memory_gb'): - assert (getattr(node._interface, attr) == - getattr(mapnode._interface, attr)) + for attr in ('_n_procs', '_mem_gb'): + assert (getattr(node, attr) == + getattr(mapnode, attr)) def test_node_hash(tmpdir): - wd = str(tmpdir) - os.chdir(wd) from nipype.interfaces.utility import Function + tmpdir.chdir() def func1(): return 1 @@ -548,13 +547,13 @@ def func2(a): modify = lambda x: x + 1 n1.inputs.a = 1 w1.connect(n1, ('a', modify), n2, 'a') - 
w1.base_dir = wd + w1.base_dir = os.getcwd() # generate outputs w1.run(plugin='Linear') # ensure plugin is being called w1.config['execution'] = {'stop_on_first_crash': 'true', 'local_hash_check': 'false', - 'crashdump_dir': wd} + 'crashdump_dir': os.getcwd()} # create dummy distributed plugin class from nipype.pipeline.plugins.base import DistributedPluginBase @@ -576,14 +575,14 @@ def _submit_job(self, node, updatehash=False): # set local check w1.config['execution'] = {'stop_on_first_crash': 'true', 'local_hash_check': 'true', - 'crashdump_dir': wd} + 'crashdump_dir': os.getcwd()} w1.run(plugin=RaiseError()) def test_old_config(tmpdir): - wd = str(tmpdir) - os.chdir(wd) + tmpdir.chdir() + wd = os.getcwd() from nipype.interfaces.utility import Function def func1(): @@ -614,8 +613,8 @@ def func2(a): def test_mapnode_json(tmpdir): """Tests that mapnodes don't generate excess jsons """ - wd = str(tmpdir) - os.chdir(wd) + tmpdir.chdir() + wd = os.getcwd() from nipype import MapNode, Function, Workflow def func1(in1): @@ -671,8 +670,8 @@ def test_parameterize_dirs_false(tmpdir): def test_serial_input(tmpdir): - wd = str(tmpdir) - os.chdir(wd) + tmpdir.chdir() + wd = os.getcwd() from nipype import MapNode, Function, Workflow def func1(in1): @@ -708,7 +707,7 @@ def func1(in1): def test_write_graph_runs(tmpdir): - os.chdir(str(tmpdir)) + tmpdir.chdir() for graph in ('orig', 'flat', 'exec', 'hierarchical', 'colored'): for simple in (True, False): @@ -736,7 +735,7 @@ def test_write_graph_runs(tmpdir): def test_deep_nested_write_graph_runs(tmpdir): - os.chdir(str(tmpdir)) + tmpdir.chdir() for graph in ('orig', 'flat', 'exec', 'hierarchical', 'colored'): for simple in (True, False): diff --git a/nipype/pipeline/engine/utils.py b/nipype/pipeline/engine/utils.py index f677d6c253..5d2291b6fa 100644 --- a/nipype/pipeline/engine/utils.py +++ b/nipype/pipeline/engine/utils.py @@ -37,6 +37,7 @@ from ... 
import logging, config logger = logging.getLogger('workflow') +PY3 = sys.version_info[0] > 2 try: dfs_preorder = nx.dfs_preorder @@ -1022,7 +1023,8 @@ def export_graph(graph_in, base_dir=None, show=False, use_execgraph=False, _write_detailed_dot(graph, outfname) if format != 'dot': cmd = 'dot -T%s -O %s' % (format, outfname) - res = CommandLine(cmd, terminal_output='allatonce').run() + res = CommandLine(cmd, terminal_output='allatonce', + resource_monitor=False).run() if res.runtime.returncode: logger.warn('dot2png: %s', res.runtime.stderr) pklgraph = _create_dot_graph(graph, show_connectinfo, simple_form) @@ -1033,7 +1035,8 @@ def export_graph(graph_in, base_dir=None, show=False, use_execgraph=False, nx.drawing.nx_pydot.write_dot(pklgraph, simplefname) if format != 'dot': cmd = 'dot -T%s -O %s' % (format, simplefname) - res = CommandLine(cmd, terminal_output='allatonce').run() + res = CommandLine(cmd, terminal_output='allatonce', + resource_monitor=False).run() if res.runtime.returncode: logger.warn('dot2png: %s', res.runtime.stderr) if show: @@ -1053,7 +1056,7 @@ def format_dot(dotfilename, format='png'): if format != 'dot': cmd = 'dot -T%s -O \'%s\'' % (format, dotfilename) try: - CommandLine(cmd).run() + CommandLine(cmd, resource_monitor=False).run() except IOError as ioe: if "could not be found" in str(ioe): raise IOError("Cannot draw directed graph; executable 'dot' is unavailable") @@ -1079,7 +1082,7 @@ def make_output_dir(outdir): except OSError: logger.debug("Problem creating %s", outdir) if not os.path.exists(outdir): - raise OSError('Could not create %s', outdir) + raise OSError('Could not create %s' % outdir) return outdir @@ -1266,7 +1269,7 @@ def write_workflow_prov(graph, filename=None, format='all'): ps.g.add_bundle(sub_bundle) bundle_entity = ps.g.entity(sub_bundle.identifier, other_attributes={'prov:type': - pm.PROV_BUNDLE}) + pm.PROV_BUNDLE}) ps.g.wasGeneratedBy(bundle_entity, process) else: process.add_attributes({pm.PROV["type"]: nipype_ns["Node"]}) @@ -1279,7 +1282,7 @@ def write_workflow_prov(graph, filename=None, format='all'): ps.g.add_bundle(result_bundle) bundle_entity = ps.g.entity(result_bundle.identifier, other_attributes={'prov:type': - pm.PROV_BUNDLE}) + pm.PROV_BUNDLE}) ps.g.wasGeneratedBy(bundle_entity, process) processes.append(process) @@ -1294,6 +1297,61 @@ def write_workflow_prov(graph, filename=None, format='all'): return ps.g +def write_workflow_resources(graph, filename=None): + """ + Generate a JSON file with profiling traces that can be loaded + in a pandas DataFrame or processed with JavaScript like D3.js + """ + import simplejson as json + if not filename: + filename = os.path.join(os.getcwd(), 'resource_monitor.json') + + big_dict = { + 'time': [], + 'name': [], + 'interface': [], + 'mem_gb': [], + 'cpus': [], + 'mapnode': [], + 'params': [], + } + + for idx, node in enumerate(graph.nodes()): + nodename = node.fullname + classname = node._interface.__class__.__name__ + + params = '' + if node.parameterization: + params = '_'.join(['{}'.format(p) + for p in node.parameterization]) + + try: + rt_list = node.result.runtime + except Exception: + logger.warning('Could not access runtime info for node %s' + ' (%s interface)', nodename, classname) + continue + + if not isinstance(rt_list, list): + rt_list = [rt_list] + + for subidx, runtime in enumerate(rt_list): + nsamples = len(runtime.prof_dict['time']) + + for key in ['time', 'mem_gb', 'cpus']: + big_dict[key] += runtime.prof_dict[key] + + big_dict['interface'] += [classname] * nsamples + 
big_dict['name'] += [nodename] * nsamples + big_dict['mapnode'] += [subidx] * nsamples + big_dict['params'] += [params] * nsamples + + with open(filename, 'w' if PY3 else 'wb') as rsf: + json.dump(big_dict, rsf, ensure_ascii=False) + + return filename + + def topological_sort(graph, depth_first=False): """Returns a depth first sorted order if depth_first is True """ diff --git a/nipype/pipeline/engine/workflows.py b/nipype/pipeline/engine/workflows.py index 17d49b046a..35f1f7df3b 100644 --- a/nipype/pipeline/engine/workflows.py +++ b/nipype/pipeline/engine/workflows.py @@ -51,6 +51,7 @@ write_rst_list, to_str) from .utils import (generate_expanded_graph, modify_paths, export_graph, make_output_dir, write_workflow_prov, + write_workflow_resources, clean_working_directory, format_dot, topological_sort, get_print_name, merge_dict, evaluate_connect_function, _write_inputs, format_node) @@ -593,6 +594,9 @@ def run(self, plugin=None, plugin_args=None, updatehash=False): 'workflow_provenance_%s' % datestr) logger.info('Provenance file prefix: %s' % prov_base) write_workflow_prov(execgraph, prov_base, format='all') + + if config.resource_monitor: + write_workflow_resources(execgraph) return execgraph # PRIVATE API AND FUNCTIONS diff --git a/nipype/pipeline/plugins/API.rst b/nipype/pipeline/plugins/API.rst deleted file mode 100644 index 57ef2632bc..0000000000 --- a/nipype/pipeline/plugins/API.rst +++ /dev/null @@ -1,8 +0,0 @@ -Execution plugin API -==================== - -Current status: - -class plugin_runner(PluginBase): - - def run(graph, config, updatehash) diff --git a/nipype/pipeline/plugins/__init__.py b/nipype/pipeline/plugins/__init__.py index cb2c193004..34d3abdebc 100644 --- a/nipype/pipeline/plugins/__init__.py +++ b/nipype/pipeline/plugins/__init__.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- # vi: set ft=python sts=4 ts=4 sw=4 et: +from __future__ import print_function, division, unicode_literals, absolute_import from .debug import DebugPlugin from .linear import LinearPlugin @@ -19,5 +20,4 @@ from .slurm import SLURMPlugin from .slurmgraph import SLURMGraphPlugin -from .callback_log import log_nodes_cb from . import semaphore_singleton diff --git a/nipype/pipeline/plugins/base.py b/nipype/pipeline/plugins/base.py index 7334e00c52..4733cece7b 100644 --- a/nipype/pipeline/plugins/base.py +++ b/nipype/pipeline/plugins/base.py @@ -9,211 +9,88 @@ from copy import deepcopy from glob import glob import os -import getpass import shutil -from socket import gethostname import sys -import uuid -from time import strftime, sleep, time -from traceback import format_exception, format_exc -from warnings import warn +from time import sleep, time +from traceback import format_exc import numpy as np import scipy.sparse as ssp - from ... 
import logging -from ...utils.filemanip import savepkl, loadpkl, crash2txt +from ...utils.filemanip import loadpkl from ...utils.misc import str2bool from ..engine.utils import (nx, dfs_preorder, topological_sort) from ..engine import MapNode - +from .tools import report_crash, report_nodes_not_run, create_pyscript logger = logging.getLogger('workflow') -iflogger = logging.getLogger('interface') -def report_crash(node, traceback=None, hostname=None): - """Writes crash related information to a file +class PluginBase(object): """ - name = node._id - if node.result and hasattr(node.result, 'runtime') and \ - node.result.runtime: - if isinstance(node.result.runtime, list): - host = node.result.runtime[0].hostname - else: - host = node.result.runtime.hostname - else: - if hostname: - host = hostname - else: - host = gethostname() - message = ['Node %s failed to run on host %s.' % (name, - host)] - logger.error(message) - if not traceback: - exc_type, exc_value, exc_traceback = sys.exc_info() - traceback = format_exception(exc_type, - exc_value, - exc_traceback) - timeofcrash = strftime('%Y%m%d-%H%M%S') - login_name = getpass.getuser() - crashfile = 'crash-%s-%s-%s-%s' % (timeofcrash, - login_name, - name, - str(uuid.uuid4())) - crashdir = node.config['execution']['crashdump_dir'] - if crashdir is None: - crashdir = os.getcwd() - if not os.path.exists(crashdir): - os.makedirs(crashdir) - crashfile = os.path.join(crashdir, crashfile) - if node.config['execution']['crashfile_format'].lower() in ['text', 'txt']: - crashfile += '.txt' - else: - crashfile += '.pklz' - logger.info('Saving crash info to %s' % crashfile) - logger.info(''.join(traceback)) - if node.config['execution']['crashfile_format'].lower() in ['text', 'txt']: - crash2txt(crashfile, dict(node=node, traceback=traceback)) - else: - savepkl(crashfile, dict(node=node, traceback=traceback)) - return crashfile - - -def report_nodes_not_run(notrun): - """List nodes that crashed with crashfile info - - Optionally displays dependent nodes that weren't executed as a result of - the crash. + Base class for plugins + """ - if notrun: - logger.info("***********************************") - for info in notrun: - logger.error("could not run node: %s" % - '.'.join((info['node']._hierarchy, - info['node']._id))) - logger.info("crashfile: %s" % info['crashfile']) - logger.debug("The following dependent nodes were not run") - for subnode in info['dependents']: - logger.debug(subnode._id) - logger.info("***********************************") - raise RuntimeError(('Workflow did not execute cleanly. 
' - 'Check log for details')) - - -def create_pyscript(node, updatehash=False, store_exception=True): - # pickle node - timestamp = strftime('%Y%m%d_%H%M%S') - if node._hierarchy: - suffix = '%s_%s_%s' % (timestamp, node._hierarchy, node._id) - batch_dir = os.path.join(node.base_dir, - node._hierarchy.split('.')[0], - 'batch') - else: - suffix = '%s_%s' % (timestamp, node._id) - batch_dir = os.path.join(node.base_dir, 'batch') - if not os.path.exists(batch_dir): - os.makedirs(batch_dir) - pkl_file = os.path.join(batch_dir, 'node_%s.pklz' % suffix) - savepkl(pkl_file, dict(node=node, updatehash=updatehash)) - mpl_backend = node.config["execution"]["matplotlib_backend"] - # create python script to load and trap exception - cmdstr = """import os -import sys -can_import_matplotlib = True #Silently allow matplotlib to be ignored -try: - import matplotlib - matplotlib.use('%s') -except ImportError: - can_import_matplotlib = False - pass - -from nipype import config, logging -from nipype.utils.filemanip import loadpkl, savepkl -from socket import gethostname -from traceback import format_exception -info = None -pklfile = '%s' -batchdir = '%s' -from nipype.utils.filemanip import loadpkl, savepkl -try: - if not sys.version_info < (2, 7): - from collections import OrderedDict - config_dict=%s - config.update_config(config_dict) - ## Only configure matplotlib if it was successfully imported, matplotlib is an optional component to nipype - if can_import_matplotlib: - config.update_matplotlib() - logging.update_logging(config) - traceback=None - cwd = os.getcwd() - info = loadpkl(pklfile) - result = info['node'].run(updatehash=info['updatehash']) -except Exception as e: - etype, eval, etr = sys.exc_info() - traceback = format_exception(etype,eval,etr) - if info is None or not os.path.exists(info['node'].output_dir()): - result = None - resultsfile = os.path.join(batchdir, 'crashdump_%s.pklz') - else: - result = info['node'].result - resultsfile = os.path.join(info['node'].output_dir(), - 'result_%%s.pklz'%%info['node'].name) -""" - if store_exception: - cmdstr += """ - savepkl(resultsfile, dict(result=result, hostname=gethostname(), - traceback=traceback)) -""" - else: - cmdstr += """ - if info is None: - savepkl(resultsfile, dict(result=result, hostname=gethostname(), - traceback=traceback)) - else: - from nipype.pipeline.plugins.base import report_crash - report_crash(info['node'], traceback, gethostname()) - raise Exception(e) -""" - cmdstr = cmdstr % (mpl_backend, pkl_file, batch_dir, node.config, suffix) - pyscript = os.path.join(batch_dir, 'pyscript_%s.py' % suffix) - with open(pyscript, 'wt') as fp: - fp.writelines(cmdstr) - return pyscript + def __init__(self, plugin_args=None): + if plugin_args is None: + plugin_args = {} + self.plugin_args = plugin_args + self._config = None + self._status_callback = plugin_args.get('status_callback') + def run(self, graph, config, updatehash=False): + """ + The core plugin member that should be implemented by + all plugins. 
-class PluginBase(object):
- """Base class for plugins"""
+ graph: a networkx, flattened :abbr:`DAG (Directed Acyclic Graph)`
+ to be executed
- def __init__(self, plugin_args=None):
- if plugin_args and 'status_callback' in plugin_args:
- self._status_callback = plugin_args['status_callback']
- else:
- self._status_callback = None
- return
+ config: a nipype.config object
- def run(self, graph, config, updatehash=False):
+ updatehash:
+
+ """
raise NotImplementedError
class DistributedPluginBase(PluginBase):
- """Execute workflow with a distribution engine
+ """
+ Execute workflow with a distribution engine
+
+ Relevant class attributes
+ -------------------------
+
+ procs: list (N) of underlying interface elements to be processed
+ proc_done: a boolean numpy array (N,) signifying whether a process has been
+ submitted for execution
+ proc_pending: a boolean numpy array (N,) signifying whether a
+ process is currently running.
+ depidx: a boolean matrix (NxN) storing the dependency structure across
+ processes. Process dependencies are derived from each column.
+
+ Combinations of ``proc_done`` and ``proc_pending``
+ --------------------------------------------------
+
+ +------------+---------------+--------------------------------+
+ | proc_done | proc_pending | outcome |
+ +============+===============+================================+
+ | True | False | Process is finished |
+ +------------+---------------+--------------------------------+
+ | True | True | Process is currently being run |
+ +------------+---------------+--------------------------------+
+ | False | False | Process is queued |
+ +------------+---------------+--------------------------------+
+ | False | True | INVALID COMBINATION |
+ +------------+---------------+--------------------------------+
"""
def __init__(self, plugin_args=None):
- """Initialize runtime attributes to none
-
- procs: list (N) of underlying interface elements to be processed
- proc_done: a boolean vector (N) signifying whether a process has been
- executed
- proc_pending: a boolean vector (N) signifying whether a
- process is currently running. Note: A process is finished only when
- both proc_done==True and
- proc_pending==False
- depidx: a boolean matrix (NxN) storing the dependency structure accross
- processes. Process dependencies are derived from each column. 
+ """ + Initialize runtime attributes to none + """ super(DistributedPluginBase, self).__init__(plugin_args=plugin_args) self.procs = None @@ -223,32 +100,55 @@ def __init__(self, plugin_args=None): self.mapnodesubids = None self.proc_done = None self.proc_pending = None - self.max_jobs = np.inf - if plugin_args and 'max_jobs' in plugin_args: - self.max_jobs = plugin_args['max_jobs'] + self.pending_tasks = [] + self.max_jobs = self.plugin_args.get('max_jobs', np.inf) + + def _prerun_check(self, graph): + """Stub method to validate/massage graph and nodes before running""" + + def _postrun_check(self): + """Stub method to close any open resources""" def run(self, graph, config, updatehash=False): - """Executes a pre-defined pipeline using distributed approaches + """ + Executes a pre-defined pipeline using distributed approaches """ logger.info("Running in parallel.") self._config = config + poll_sleep_secs = float(config['execution']['poll_sleep_duration']) + + self._prerun_check(graph) # Generate appropriate structures for worker-manager model self._generate_dependency_list(graph) - self.pending_tasks = [] - self.readytorun = [] self.mapnodes = [] self.mapnodesubids = {} # setup polling - TODO: change to threaded model notrun = [] - while np.any(self.proc_done == False) | \ - np.any(self.proc_pending == True): + while not np.all(self.proc_done) or np.any(self.proc_pending): + # Check to see if a job is available (jobs without dependencies not run) + # See https://github.com/nipy/nipype/pull/2200#discussion_r141605722 + jobs_ready = np.nonzero(~self.proc_done & (self.depidx.sum(0) == 0))[1] + + logger.info('Progress: %d jobs, %d/%d/%d (done/running/ready),' + ' %d/%d (pending_tasks/waiting).', + len(self.proc_done), + np.sum(self.proc_done ^ self.proc_pending), + np.sum(self.proc_done & self.proc_pending), + len(jobs_ready), + len(self.pending_tasks), + np.sum(~self.proc_done & ~self.proc_pending)) toappend = [] # trigger callbacks for any pending results while self.pending_tasks: taskid, jobid = self.pending_tasks.pop() try: result = self._get_result(taskid) + except Exception: + notrun.append(self._clean_queue( + jobid, graph, result={'result': None, + 'traceback': format_exc()})) + else: if result: if result['traceback']: notrun.append(self._clean_queue(jobid, graph, @@ -258,36 +158,27 @@ def run(self, graph, config, updatehash=False): self._remove_node_dirs() self._clear_task(taskid) else: + assert self.proc_done[jobid] and self.proc_pending[jobid] toappend.insert(0, (taskid, jobid)) - except Exception: - result = {'result': None, - 'traceback': format_exc()} - notrun.append(self._clean_queue(jobid, graph, - result=result)) + if toappend: self.pending_tasks.extend(toappend) num_jobs = len(self.pending_tasks) - logger.debug('Number of pending tasks: %d' % num_jobs) + logger.debug('Tasks currently running: %d. 
Pending: %d.', num_jobs, + np.sum(self.proc_done & self.proc_pending)) if num_jobs < self.max_jobs: self._send_procs_to_workers(updatehash=updatehash, graph=graph) else: - logger.debug('Not submitting') - self._wait() + logger.debug('Not submitting (max jobs reached)') + + sleep(poll_sleep_secs) self._remove_node_dirs() report_nodes_not_run(notrun) # close any open resources - self._close() - - def _wait(self): - sleep(float(self._config['execution']['poll_sleep_duration'])) - - def _close(self): - # close any open resources, this could raise NotImplementedError - # but I didn't want to break other plugins - return True + self._postrun_check() def _get_result(self, taskid): raise NotImplementedError @@ -296,18 +187,26 @@ def _submit_job(self, node, updatehash=False): raise NotImplementedError def _report_crash(self, node, result=None): - raise NotImplementedError + tb = None + if result is not None: + node._result = result['result'] + tb = result['traceback'] + node._traceback = tb + return report_crash(node, traceback=tb) def _clear_task(self, taskid): raise NotImplementedError def _clean_queue(self, jobid, graph, result=None): + logger.debug('Clearing %d from queue', jobid) + + if self._status_callback: + self._status_callback(self.procs[jobid], 'exception') + if str2bool(self._config['execution']['stop_on_first_crash']): raise RuntimeError("".join(result['traceback'])) crashfile = self._report_crash(self.procs[jobid], result=result) - if self._status_callback: - self._status_callback(self.procs[jobid], 'exception') if jobid in self.mapnodesubids: # remove current jobid self.proc_pending[jobid] = False @@ -325,8 +224,8 @@ def _submit_mapnode(self, jobid): self.mapnodes.append(jobid) mapnodesubids = self.procs[jobid].get_subnodes() numnodes = len(mapnodesubids) - logger.info('Adding %d jobs for mapnode %s' % (numnodes, - self.procs[jobid]._id)) + logger.debug('Adding %d jobs for mapnode %s', + numnodes, self.procs[jobid]._id) for i in range(numnodes): self.mapnodesubids[self.depidx.shape[0] + i] = jobid self.procs.extend(mapnodesubids) @@ -347,9 +246,11 @@ def _submit_mapnode(self, jobid): return False def _send_procs_to_workers(self, updatehash=False, graph=None): - """ Sends jobs to workers """ - while np.any(self.proc_done == False): + Sends jobs to workers + """ + + while not np.all(self.proc_done): num_jobs = len(self.pending_tasks) if np.isinf(self.max_jobs): slots = None @@ -358,15 +259,16 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): logger.debug('Slots available: %s' % slots) if (num_jobs >= self.max_jobs) or (slots == 0): break - # Check to see if a job is available - jobids = np.flatnonzero((self.proc_done == False) & - (self.depidx.sum(axis=0) == 0).__array__()) + + # Check to see if a job is available (jobs without dependencies not run) + # See https://github.com/nipy/nipype/pull/2200#discussion_r141605722 + jobids = np.nonzero(~self.proc_done & (self.depidx.sum(0) == 0))[1] + if len(jobids) > 0: # send all available jobs - if slots: - logger.info('Pending[%d] Submitting[%d] jobs Slots[%d]' % (num_jobs, len(jobids[:slots]), slots)) - else: - logger.info('Pending[%d] Submitting[%d] jobs Slots[inf]' % (num_jobs, len(jobids))) + logger.info('Pending[%d] Submitting[%d] jobs Slots[%d]', + num_jobs, len(jobids[:slots]), slots or 'inf') + for jobid in jobids[:slots]: if isinstance(self.procs[jobid], MapNode): try: @@ -387,27 +289,8 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): (self.procs[jobid]._id, jobid)) if self._status_callback: 
self._status_callback(self.procs[jobid], 'start') - continue_with_submission = True - if str2bool(self.procs[jobid].config['execution'] - ['local_hash_check']): - logger.debug('checking hash locally') - try: - hash_exists, _, _, _ = self.procs[ - jobid].hash_exists() - logger.debug('Hash exists %s' % str(hash_exists)) - if (hash_exists and (self.procs[jobid].overwrite is False or - (self.procs[jobid].overwrite is None and not - self.procs[jobid]._interface.always_run))): - continue_with_submission = False - self._task_finished_cb(jobid) - self._remove_node_dirs() - except Exception: - self._clean_queue(jobid, graph) - self.proc_pending[jobid] = False - continue_with_submission = False - logger.debug('Finished checking hash %s' % - str(continue_with_submission)) - if continue_with_submission: + + if not self._local_hash_check(jobid, graph): if self.procs[jobid].run_without_submitting: logger.debug('Running node %s on master thread' % self.procs[jobid]) @@ -430,6 +313,31 @@ def _send_procs_to_workers(self, updatehash=False, graph=None): else: break + def _local_hash_check(self, jobid, graph): + if not str2bool(self.procs[jobid].config['execution']['local_hash_check']): + return False + + logger.debug('Checking hash (%d) locally', jobid) + + hash_exists, _, _, _ = self.procs[jobid].hash_exists() + overwrite = self.procs[jobid].overwrite + always_run = self.procs[jobid]._interface.always_run + + if hash_exists and (overwrite is False or + overwrite is None and not always_run): + logger.debug('Skipping cached node %s with ID %s.', + self.procs[jobid]._id, jobid) + try: + self._task_finished_cb(jobid) + self._remove_node_dirs() + except Exception: + logger.debug('Error skipping cached node %s (%s).', + self.procs[jobid]._id, jobid) + self._clean_queue(jobid, graph) + self.proc_pending[jobid] = False + return True + return False + def _task_finished_cb(self, jobid): """ Extract outputs and assign to inputs of dependent tasks @@ -583,15 +491,6 @@ def _submit_job(self, node, updatehash=False): fp.writelines(batchscript) return self._submit_batchtask(batchscriptfile, node) - def _report_crash(self, node, result=None): - if result and result['traceback']: - node._result = result['result'] - node._traceback = result['traceback'] - return report_crash(node, - traceback=result['traceback']) - else: - return report_crash(node) - def _clear_task(self, taskid): del self._pending[taskid] @@ -601,8 +500,8 @@ class GraphPluginBase(PluginBase): """ def __init__(self, plugin_args=None): - if plugin_args and 'status_callback' in plugin_args: - warn('status_callback not supported for Graph submission plugins') + if plugin_args and plugin_args.get('status_callback'): + logger.warning('status_callback not supported for Graph submission plugins') super(GraphPluginBase, self).__init__(plugin_args=plugin_args) def run(self, graph, config, updatehash=False): diff --git a/nipype/pipeline/plugins/callback_log.py b/nipype/pipeline/plugins/callback_log.py deleted file mode 100644 index 5ddc9eedd5..0000000000 --- a/nipype/pipeline/plugins/callback_log.py +++ /dev/null @@ -1,67 +0,0 @@ -# -*- coding: utf-8 -*- -# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- -# vi: set ft=python sts=4 ts=4 sw=4 et: -"""Callback logger for recording workflow and node run stats -""" -from __future__ import print_function, division, unicode_literals, absolute_import - - -# Log node stats function -def log_nodes_cb(node, status): - """Function to record node run statistics to a log file as json - dictionaries - - 
Parameters - ---------- - node : nipype.pipeline.engine.Node - the node being logged - status : string - acceptable values are 'start', 'end'; otherwise it is - considered and error - - Returns - ------- - None - this function does not return any values, it logs the node - status info to the callback logger - """ - - # Import packages - import datetime - import logging - import json - - # Check runtime profile stats - if node.result is not None: - try: - runtime = node.result.runtime - runtime_memory_gb = runtime.runtime_memory_gb - runtime_threads = runtime.runtime_threads - except AttributeError: - runtime_memory_gb = runtime_threads = 'Unknown' - else: - runtime_memory_gb = runtime_threads = 'N/A' - - # Init variables - logger = logging.getLogger('callback') - status_dict = {'name' : node.name, - 'id' : node._id, - 'estimated_memory_gb' : node._interface.estimated_memory_gb, - 'num_threads' : node._interface.num_threads} - - # Check status and write to log - # Start - if status == 'start': - status_dict['start'] = str(datetime.datetime.now()) - # End - elif status == 'end': - status_dict['finish'] = str(datetime.datetime.now()) - status_dict['runtime_threads'] = runtime_threads - status_dict['runtime_memory_gb'] = runtime_memory_gb - # Other - else: - status_dict['finish'] = str(datetime.datetime.now()) - status_dict['error'] = True - - # Dump string to log - logger.debug(json.dumps(status_dict)) diff --git a/nipype/pipeline/plugins/condor.py b/nipype/pipeline/plugins/condor.py index 9b8b5c218d..0548a7afbc 100644 --- a/nipype/pipeline/plugins/condor.py +++ b/nipype/pipeline/plugins/condor.py @@ -7,7 +7,9 @@ from time import sleep from ...interfaces.base import CommandLine -from .base import (SGELikeBatchManagerBase, logger, iflogger, logging) +from ... import logging +from .base import SGELikeBatchManagerBase, logger +iflogger = logging.getLogger('interface') class CondorPlugin(SGELikeBatchManagerBase): @@ -46,6 +48,7 @@ def __init__(self, **kwargs): def _is_pending(self, taskid): cmd = CommandLine('condor_q', + resource_monitor=False, terminal_output='allatonce') cmd.inputs.args = '%d' % taskid # check condor cluster @@ -59,6 +62,7 @@ def _is_pending(self, taskid): def _submit_batchtask(self, scriptfile, node): cmd = CommandLine('condor_qsub', environ=dict(os.environ), + resource_monitor=False, terminal_output='allatonce') path = os.path.dirname(scriptfile) qsubargs = '' diff --git a/nipype/pipeline/plugins/dagman.py b/nipype/pipeline/plugins/dagman.py index 1001ab5dac..ce2a2a5592 100644 --- a/nipype/pipeline/plugins/dagman.py +++ b/nipype/pipeline/plugins/dagman.py @@ -10,7 +10,7 @@ import time from warnings import warn -from .base import (GraphPluginBase, logger) +from .base import GraphPluginBase, logger from ...interfaces.base import CommandLine @@ -154,6 +154,7 @@ def _submit_graph(self, pyfiles, dependencies, nodes): child)) # hand over DAG to condor_dagman cmd = CommandLine('condor_submit_dag', environ=dict(os.environ), + resource_monitor=False, terminal_output='allatonce') # needs -update_submit or re-running a workflow will fail cmd.inputs.args = '%s -update_submit %s' % (self._dagman_args, diff --git a/nipype/pipeline/plugins/lsf.py b/nipype/pipeline/plugins/lsf.py index 6e27b3ab95..5ee0483221 100644 --- a/nipype/pipeline/plugins/lsf.py +++ b/nipype/pipeline/plugins/lsf.py @@ -7,8 +7,10 @@ import re from time import sleep -from .base import (SGELikeBatchManagerBase, logger, iflogger, logging) +from ... 
import logging from ...interfaces.base import CommandLine +from .base import SGELikeBatchManagerBase, logger +iflogger = logging.getLogger('interface') class LSFPlugin(SGELikeBatchManagerBase): @@ -45,6 +47,7 @@ def _is_pending(self, taskid): finished and is ready to be checked for completeness. So return True if status is either 'PEND' or 'RUN'""" cmd = CommandLine('bjobs', + resource_monitor=False, terminal_output='allatonce') cmd.inputs.args = '%d' % taskid # check lsf task @@ -60,6 +63,7 @@ def _is_pending(self, taskid): def _submit_batchtask(self, scriptfile, node): cmd = CommandLine('bsub', environ=dict(os.environ), + resource_monitor=False, terminal_output='allatonce') path = os.path.dirname(scriptfile) bsubargs = '' diff --git a/nipype/pipeline/plugins/multiproc.py b/nipype/pipeline/plugins/multiproc.py index 3994f2e1cd..ecbb8a4a70 100644 --- a/nipype/pipeline/plugins/multiproc.py +++ b/nipype/pipeline/plugins/multiproc.py @@ -7,25 +7,24 @@ http://stackoverflow.com/a/8963618/1183453 """ from __future__ import print_function, division, unicode_literals, absolute_import -from builtins import open # Import packages from multiprocessing import Process, Pool, cpu_count, pool -import threading from traceback import format_exception import sys from copy import deepcopy import numpy as np -from ... import logging, config -from ...utils.misc import str2bool +from ... import logging +from ...utils.profiler import get_system_total_memory_gb from ..engine import MapNode -from .base import (DistributedPluginBase, report_crash) +from .base import DistributedPluginBase # Init logger logger = logging.getLogger('workflow') + # Run node def run_node(node, updatehash, taskid): """Function to execute node.run(), catch and log any errors and @@ -51,8 +50,7 @@ def run_node(node, updatehash, taskid): try: result['result'] = node.run(updatehash=updatehash) except: - etype, eval, etr = sys.exc_info() - result['traceback'] = format_exception(etype, eval, etr) + result['traceback'] = format_exception(*sys.exc_info()) result['result'] = node.result # Return the result dictionary @@ -77,36 +75,9 @@ class NonDaemonPool(pool.Pool): Process = NonDaemonProcess -# Get total system RAM -def get_system_total_memory_gb(): - """Function to get the total RAM of the running system in GB - """ - - # Import packages - import os - import sys - - # Get memory - if 'linux' in sys.platform: - with open('/proc/meminfo', 'r') as f_in: - meminfo_lines = f_in.readlines() - mem_total_line = [line for line in meminfo_lines \ - if 'MemTotal' in line][0] - mem_total = float(mem_total_line.split()[1]) - memory_gb = mem_total/(1024.0**2) - elif 'darwin' in sys.platform: - mem_str = os.popen('sysctl hw.memsize').read().strip().split(' ')[-1] - memory_gb = float(mem_str)/(1024.0**3) - else: - err_msg = 'System platform: %s is not supported' - raise Exception(err_msg) - - # Return memory - return memory_gb - - class MultiProcPlugin(DistributedPluginBase): - """Execute workflow with multiprocessing, not sending more jobs at once + """ + Execute workflow with multiprocessing, not sending more jobs at once than the system can support. The plugin_args input to run can be used to control the multiprocessing @@ -114,17 +85,25 @@ class MultiProcPlugin(DistributedPluginBase): should be used. When those parameters are not specified, the number of threads and memory of the system is used. 
- System consuming nodes should be tagged: - memory_consuming_node.interface.estimated_memory_gb = 8 - thread_consuming_node.interface.num_threads = 16 + System consuming nodes should be tagged:: - The default number of threads and memory for a node is 1. + memory_consuming_node.mem_gb = 8 + thread_consuming_node.n_procs = 16 + + The default number of threads and memory are set at node + creation, and are 1 and 0.25GB respectively. Currently supported options are: - non_daemon : boolean flag to execute as non-daemon processes - n_procs: maximum number of threads to be executed in parallel - memory_gb: maximum memory (in GB) that can be used at once. + - raise_insufficient: raise error if the requested resources for + a node over the maximum `n_procs` and/or `memory_gb` + (default is ``True``). + - scheduler: sort jobs topologically (``'tsort'``, default value) + or prioritize jobs by, first, memory consumption and, second, + number of threads (``'mem_thread'`` option). """ @@ -134,207 +113,177 @@ def __init__(self, plugin_args=None): self._taskresult = {} self._task_obj = {} self._taskid = 0 - non_daemon = True - self.plugin_args = plugin_args - self.processors = cpu_count() - self.memory_gb = get_system_total_memory_gb()*0.9 # 90% of system memory - - self._timeout=2.0 - self._event = threading.Event() - - - # Check plugin args - if self.plugin_args: - if 'non_daemon' in self.plugin_args: - non_daemon = plugin_args['non_daemon'] - if 'n_procs' in self.plugin_args: - self.processors = self.plugin_args['n_procs'] - if 'memory_gb' in self.plugin_args: - self.memory_gb = self.plugin_args['memory_gb'] - - logger.debug("MultiProcPlugin starting %d threads in pool"%(self.processors)) + # Read in options or set defaults. + non_daemon = self.plugin_args.get('non_daemon', True) + self.processors = self.plugin_args.get('n_procs', cpu_count()) + self.memory_gb = self.plugin_args.get('memory_gb', # Allocate 90% of system memory + get_system_total_memory_gb() * 0.9) + self.raise_insufficient = self.plugin_args.get('raise_insufficient', True) # Instantiate different thread pools for non-daemon processes - if non_daemon: - # run the execution using the non-daemon pool subclass - self.pool = NonDaemonPool(processes=self.processors) - else: - self.pool = Pool(processes=self.processors) - - def _wait(self): - if len(self.pending_tasks) > 0: - if self._config['execution']['poll_sleep_duration']: - self._timeout = float(self._config['execution']['poll_sleep_duration']) - sig_received=self._event.wait(self._timeout) - if not sig_received: - logger.debug('MultiProcPlugin timeout before signal received. 
Deadlock averted??')
- self._event.clear()
+ logger.debug('MultiProcPlugin starting in "%sdaemon" mode (n_procs=%d, mem_gb=%0.2f)',
+ 'non' if non_daemon else '', self.processors, self.memory_gb)
+ self.pool = (NonDaemonPool if non_daemon else Pool)(processes=self.processors)
def _async_callback(self, args):
- self._taskresult[args['taskid']]=args
- self._event.set()
+ self._taskresult[args['taskid']] = args
def _get_result(self, taskid):
- if taskid not in self._taskresult:
- result=None
- else:
- result=self._taskresult[taskid]
- return result
-
- def _report_crash(self, node, result=None):
- if result and result['traceback']:
- node._result = result['result']
- node._traceback = result['traceback']
- return report_crash(node,
- traceback=result['traceback'])
- else:
- return report_crash(node)
+ return self._taskresult.get(taskid)
def _clear_task(self, taskid):
del self._task_obj[taskid]
def _submit_job(self, node, updatehash=False):
self._taskid += 1
- if hasattr(node.inputs, 'terminal_output'):
- if node.inputs.terminal_output == 'stream':
- node.inputs.terminal_output = 'allatonce'
-
- self._task_obj[self._taskid] = \
- self.pool.apply_async(run_node,
- (node, updatehash, self._taskid),
- callback=self._async_callback)
+ if getattr(node.inputs, 'terminal_output', '') == 'stream':
+ node.inputs.terminal_output = 'allatonce'
+
+ self._task_obj[self._taskid] = self.pool.apply_async(
+ run_node, (node, updatehash, self._taskid),
+ callback=self._async_callback)
+
+ logger.debug('MultiProc submitted task %s (taskid=%d).',
+ node.fullname, self._taskid)
return self._taskid
- def _close(self):
+ def _prerun_check(self, graph):
+ """Check if any node exceeds the available resources"""
+ tasks_mem_gb = []
+ tasks_num_th = []
+ for node in graph.nodes():
+ tasks_mem_gb.append(node.mem_gb)
+ tasks_num_th.append(node.n_procs)
+
+ if np.any(np.array(tasks_mem_gb) > self.memory_gb):
+ logger.warning(
+ 'Some nodes exceed the total amount of memory available '
+ '(%0.2fGB).', self.memory_gb)
+ if self.raise_insufficient:
+ raise RuntimeError('Insufficient resources available for job')
+
+ if np.any(np.array(tasks_num_th) > self.processors):
+ logger.warning(
+ 'Some nodes demand more threads than available (%d).',
+ self.processors)
+ if self.raise_insufficient:
+ raise RuntimeError('Insufficient resources available for job')
+
+ def _postrun_check(self):
self.pool.close()
- return True
+
+ def _check_resources(self, running_tasks):
+ """
+ Make sure there are resources available
+ """
+ free_memory_gb = self.memory_gb
+ free_processors = self.processors
+ for _, jobid in running_tasks:
+ free_memory_gb -= min(self.procs[jobid].mem_gb, free_memory_gb)
+ free_processors -= min(self.procs[jobid].n_procs, free_processors)
+
+ return free_memory_gb, free_processors
def _send_procs_to_workers(self, updatehash=False, graph=None):
- """ Sends jobs to workers when system resources are available.
- Check memory (gb) and cores usage before running jobs. """
- executing_now = []
+ Sends jobs to workers when system resources are available. 
+ """ - # Check to see if a job is available - currently_running_jobids = np.flatnonzero((self.proc_pending == True) & \ - (self.depidx.sum(axis=0) == 0).__array__()) + # Check to see if a job is available (jobs without dependencies not run) + # See https://github.com/nipy/nipype/pull/2200#discussion_r141605722 + jobids = np.nonzero(~self.proc_done & (self.depidx.sum(0) == 0))[1] # Check available system resources by summing all threads and memory used - busy_memory_gb = 0 - busy_processors = 0 - for jobid in currently_running_jobids: - if self.procs[jobid]._interface.estimated_memory_gb <= self.memory_gb and \ - self.procs[jobid]._interface.num_threads <= self.processors: + free_memory_gb, free_processors = self._check_resources(self.pending_tasks) - busy_memory_gb += self.procs[jobid]._interface.estimated_memory_gb - busy_processors += self.procs[jobid]._interface.num_threads + logger.info('Currently running %d tasks, and %d jobs ready. ' + 'Free memory (GB): %0.2f/%0.2f, Free processors: %d/%d', + len(self.pending_tasks), len(jobids), + free_memory_gb, self.memory_gb, free_processors, self.processors) - else: - raise ValueError( - "Resources required by jobid {0} ({3}GB, {4} threads) exceed what is " - "available on the system ({1}GB, {2} threads)".format( - jobid, self.memory_gb, self.processors, - self.procs[jobid]._interface.estimated_memory_gb, - self.procs[jobid]._interface.num_threads)) - - free_memory_gb = self.memory_gb - busy_memory_gb - free_processors = self.processors - busy_processors - - # Check all jobs without dependency not run - jobids = np.flatnonzero((self.proc_done == False) & \ - (self.depidx.sum(axis=0) == 0).__array__()) - - # Sort jobs ready to run first by memory and then by number of threads - # The most resource consuming jobs run first - jobids = sorted(jobids, - key=lambda item: (self.procs[item]._interface.estimated_memory_gb, - self.procs[item]._interface.num_threads)) - - if str2bool(config.get('execution', 'profile_runtime')): - logger.debug('Free memory (GB): %d, Free processors: %d', - free_memory_gb, free_processors) + if free_memory_gb < 0.01 or free_processors == 0: + logger.debug('No resources available') + return + + if len(jobids) + len(self.pending_tasks) == 0: + logger.debug('No tasks are being run, and no jobs can ' + 'be submitted to the queue. 
Potential deadlock') + return + + jobids = self._sort_jobs(jobids, scheduler=self.plugin_args.get('scheduler')) - # While have enough memory and processors for first job - # Submit first job on the list + # Submit jobs for jobid in jobids: - if str2bool(config.get('execution', 'profile_runtime')): - logger.debug('Next Job: %d, memory (GB): %d, threads: %d' \ - % (jobid, - self.procs[jobid]._interface.estimated_memory_gb, - self.procs[jobid]._interface.num_threads)) - - if self.procs[jobid]._interface.estimated_memory_gb <= free_memory_gb and \ - self.procs[jobid]._interface.num_threads <= free_processors: - logger.info('Executing: %s ID: %d' %(self.procs[jobid]._id, jobid)) - executing_now.append(self.procs[jobid]) - - if isinstance(self.procs[jobid], MapNode): - try: - num_subnodes = self.procs[jobid].num_subnodes() - except Exception: - etype, eval, etr = sys.exc_info() - traceback = format_exception(etype, eval, etr) - report_crash(self.procs[jobid], traceback=traceback) - self._clean_queue(jobid, graph) - self.proc_pending[jobid] = False + # First expand mapnodes + if isinstance(self.procs[jobid], MapNode): + try: + num_subnodes = self.procs[jobid].num_subnodes() + except Exception: + traceback = format_exception(*sys.exc_info()) + self._report_crash(self.procs[jobid], traceback=traceback) + self._clean_queue(jobid, graph) + self.proc_pending[jobid] = False + continue + if num_subnodes > 1: + submit = self._submit_mapnode(jobid) + if not submit: continue - if num_subnodes > 1: - submit = self._submit_mapnode(jobid) - if not submit: - continue - - # change job status in appropriate queues - self.proc_done[jobid] = True - self.proc_pending[jobid] = True - - free_memory_gb -= self.procs[jobid]._interface.estimated_memory_gb - free_processors -= self.procs[jobid]._interface.num_threads - - # Send job to task manager and add to pending tasks - if self._status_callback: - self._status_callback(self.procs[jobid], 'start') - if str2bool(self.procs[jobid].config['execution']['local_hash_check']): - logger.debug('checking hash locally') - try: - hash_exists, _, _, _ = self.procs[ - jobid].hash_exists() - logger.debug('Hash exists %s' % str(hash_exists)) - if (hash_exists and (self.procs[jobid].overwrite == False or - (self.procs[jobid].overwrite == None and - not self.procs[jobid]._interface.always_run))): - self._task_finished_cb(jobid) - self._remove_node_dirs() - continue - except Exception: - etype, eval, etr = sys.exc_info() - traceback = format_exception(etype, eval, etr) - report_crash(self.procs[jobid], traceback=traceback) - self._clean_queue(jobid, graph) - self.proc_pending[jobid] = False - continue - logger.debug('Finished checking hash') - - if self.procs[jobid].run_without_submitting: - logger.debug('Running node %s on master thread' \ - % self.procs[jobid]) - try: - self.procs[jobid].run() - except Exception: - etype, eval, etr = sys.exc_info() - traceback = format_exception(etype, eval, etr) - report_crash(self.procs[jobid], traceback=traceback) - self._task_finished_cb(jobid) - self._remove_node_dirs() - - else: - logger.debug('MultiProcPlugin submitting %s' % str(jobid)) - tid = self._submit_job(deepcopy(self.procs[jobid]), - updatehash=updatehash) - if tid is None: - self.proc_done[jobid] = False - self.proc_pending[jobid] = False - else: - self.pending_tasks.insert(0, (tid, jobid)) + + # Check requirements of this job + next_job_gb = min(self.procs[jobid].mem_gb, self.memory_gb) + next_job_th = min(self.procs[jobid].n_procs, self.processors) + + # If node does not fit, skip 
at this moment + if next_job_th > free_processors or next_job_gb > free_memory_gb: + logger.debug('Cannot allocate job %d (%0.2fGB, %d threads).', + jobid, next_job_gb, next_job_th) + continue + + free_memory_gb -= next_job_gb + free_processors -= next_job_th + logger.debug('Allocating %s ID=%d (%0.2fGB, %d threads). Free: %0.2fGB, %d threads.', + self.procs[jobid].fullname, jobid, next_job_gb, next_job_th, + free_memory_gb, free_processors) + + # change job status in appropriate queues + self.proc_done[jobid] = True + self.proc_pending[jobid] = True + + # If cached just retrieve it, don't run + if self._local_hash_check(jobid, graph): + continue + + if self.procs[jobid].run_without_submitting: + logger.debug('Running node %s on master thread', + self.procs[jobid]) + try: + self.procs[jobid].run() + except Exception: + traceback = format_exception(*sys.exc_info()) + self._report_crash(self.procs[jobid], traceback=traceback) + + # Release resources + self._task_finished_cb(jobid) + self._remove_node_dirs() + free_memory_gb += next_job_gb + free_processors += next_job_th + continue + + # Task should be submitted to workers + # Send job to task manager and add to pending tasks + if self._status_callback: + self._status_callback(self.procs[jobid], 'start') + tid = self._submit_job(deepcopy(self.procs[jobid]), + updatehash=updatehash) + if tid is None: + self.proc_done[jobid] = False + self.proc_pending[jobid] = False else: - break + self.pending_tasks.insert(0, (tid, jobid)) + + def _sort_jobs(self, jobids, scheduler='tsort'): + if scheduler == 'mem_thread': + return sorted(jobids, key=lambda item: ( + self.procs[item].mem_gb, self.procs[item].n_procs)) + return jobids diff --git a/nipype/pipeline/plugins/oar.py b/nipype/pipeline/plugins/oar.py index ca77fade1e..e3f5ef7947 100644 --- a/nipype/pipeline/plugins/oar.py +++ b/nipype/pipeline/plugins/oar.py @@ -10,9 +10,10 @@ import subprocess import simplejson as json -from .base import (SGELikeBatchManagerBase, logger, iflogger, logging) +from ... import logging from ...interfaces.base import CommandLine - +from .base import SGELikeBatchManagerBase, logger +iflogger = logging.getLogger('interface') class OARPlugin(SGELikeBatchManagerBase): """Execute using OAR @@ -68,6 +69,7 @@ def _is_pending(self, taskid): def _submit_batchtask(self, scriptfile, node): cmd = CommandLine('oarsub', environ=dict(os.environ), + resource_monitor=False, terminal_output='allatonce') path = os.path.dirname(scriptfile) oarsubargs = '' diff --git a/nipype/pipeline/plugins/pbs.py b/nipype/pipeline/plugins/pbs.py index 5288bb36cb..6154abad74 100644 --- a/nipype/pipeline/plugins/pbs.py +++ b/nipype/pipeline/plugins/pbs.py @@ -7,9 +7,11 @@ import os from time import sleep +from ... 
import logging from ...interfaces.base import CommandLine -from .base import (SGELikeBatchManagerBase, logger, iflogger, logging) +from .base import SGELikeBatchManagerBase, logger +iflogger = logging.getLogger('interface') class PBSPlugin(SGELikeBatchManagerBase): @@ -48,6 +50,7 @@ def _is_pending(self, taskid): result = CommandLine('qstat {}'.format(taskid), environ=dict(os.environ), terminal_output='allatonce', + resource_monitor=False, ignore_exception=True).run() stderr = result.runtime.stderr errmsg = 'Unknown Job Id' # %s' % taskid @@ -59,6 +62,7 @@ def _is_pending(self, taskid): def _submit_batchtask(self, scriptfile, node): cmd = CommandLine('qsub', environ=dict(os.environ), + resource_monitor=False, terminal_output='allatonce') path = os.path.dirname(scriptfile) qsubargs = '' diff --git a/nipype/pipeline/plugins/pbsgraph.py b/nipype/pipeline/plugins/pbsgraph.py index 1aafd24e37..719b82578c 100644 --- a/nipype/pipeline/plugins/pbsgraph.py +++ b/nipype/pipeline/plugins/pbsgraph.py @@ -55,6 +55,7 @@ def _submit_graph(self, pyfiles, dependencies, nodes): qsub_args, batchscriptfile)) cmd = CommandLine('sh', environ=dict(os.environ), + resource_monitor=False, terminal_output='allatonce') cmd.inputs.args = '%s' % submitjobsfile cmd.run() diff --git a/nipype/pipeline/plugins/sge.py b/nipype/pipeline/plugins/sge.py index 268fecf2a9..6d448df3df 100644 --- a/nipype/pipeline/plugins/sge.py +++ b/nipype/pipeline/plugins/sge.py @@ -15,9 +15,10 @@ import random +from ... import logging from ...interfaces.base import CommandLine -from .base import (SGELikeBatchManagerBase, logger, iflogger, logging) - +from .base import SGELikeBatchManagerBase, logger +iflogger = logging.getLogger('interface') DEBUGGING_PREFIX = str(int(random.uniform(100, 999))) @@ -364,6 +365,7 @@ def _is_pending(self, taskid): def _submit_batchtask(self, scriptfile, node): cmd = CommandLine('qsub', environ=dict(os.environ), + resource_monitor=False, terminal_output='allatonce') path = os.path.dirname(scriptfile) qsubargs = '' diff --git a/nipype/pipeline/plugins/sgegraph.py b/nipype/pipeline/plugins/sgegraph.py index dd4b8076e8..882c455450 100644 --- a/nipype/pipeline/plugins/sgegraph.py +++ b/nipype/pipeline/plugins/sgegraph.py @@ -147,6 +147,7 @@ def make_job_name(jobnumber, nodeslist): batchscript=batchscriptfile) fp.writelines(full_line) cmd = CommandLine('bash', environ=dict(os.environ), + resource_monitor=False, terminal_output='allatonce') cmd.inputs.args = '%s' % submitjobsfile cmd.run() diff --git a/nipype/pipeline/plugins/slurm.py b/nipype/pipeline/plugins/slurm.py index 3f83772f6a..e5b797da5d 100644 --- a/nipype/pipeline/plugins/slurm.py +++ b/nipype/pipeline/plugins/slurm.py @@ -12,9 +12,11 @@ import re from time import sleep +from ... 
import logging from ...interfaces.base import CommandLine -from .base import (SGELikeBatchManagerBase, logger, iflogger, logging) +from .base import SGELikeBatchManagerBase, logger +iflogger = logging.getLogger('interface') class SLURMPlugin(SGELikeBatchManagerBase): @@ -62,6 +64,7 @@ def _is_pending(self, taskid): # subprocess.Popen requires taskid to be a string res = CommandLine('squeue', args=' '.join(['-j', '%s' % taskid]), + resource_monitor=False, terminal_output='allatonce').run() return res.runtime.stdout.find(str(taskid)) > -1 @@ -72,6 +75,7 @@ def _submit_batchtask(self, scriptfile, node): formatting/processing """ cmd = CommandLine('sbatch', environ=dict(os.environ), + resource_monitor=False, terminal_output='allatonce') path = os.path.dirname(scriptfile) diff --git a/nipype/pipeline/plugins/slurmgraph.py b/nipype/pipeline/plugins/slurmgraph.py index 794a35bc84..ed571ecffe 100644 --- a/nipype/pipeline/plugins/slurmgraph.py +++ b/nipype/pipeline/plugins/slurmgraph.py @@ -146,6 +146,7 @@ def make_job_name(jobnumber, nodeslist): batchscript=batchscriptfile) fp.writelines(full_line) cmd = CommandLine('bash', environ=dict(os.environ), + resource_monitor=False, terminal_output='allatonce') cmd.inputs.args = '%s' % submitjobsfile cmd.run() diff --git a/nipype/pipeline/plugins/tests/test_base.py b/nipype/pipeline/plugins/tests/test_base.py index 82a2a4480a..f8838e691a 100644 --- a/nipype/pipeline/plugins/tests/test_base.py +++ b/nipype/pipeline/plugins/tests/test_base.py @@ -5,12 +5,6 @@ """ import numpy as np import scipy.sparse as ssp -import re - -import mock - -import nipype.pipeline.plugins.base as pb - def test_scipy_sparse(): foo = ssp.lil_matrix(np.eye(3, k=1)) @@ -18,26 +12,6 @@ def test_scipy_sparse(): goo[goo.nonzero()] = 0 assert foo[0, 1] == 0 -def test_report_crash(): - with mock.patch('pickle.dump', mock.MagicMock()) as mock_pickle_dump: - with mock.patch('nipype.pipeline.plugins.base.format_exception', mock.MagicMock()): # see iss 1517 - mock_pickle_dump.return_value = True - mock_node = mock.MagicMock(name='mock_node') - mock_node._id = 'an_id' - mock_node.config = { - 'execution' : { - 'crashdump_dir' : '.', - 'crashfile_format' : 'pklz', - } - } - - actual_crashfile = pb.report_crash(mock_node) - - expected_crashfile = re.compile('.*/crash-.*-an_id-[0-9a-f\-]*.pklz') - - assert expected_crashfile.match(actual_crashfile).group() == actual_crashfile - assert mock_pickle_dump.call_count == 1 - ''' Can use the following code to test that a mapnode crash continues successfully Need to put this into a nose-test with a timeout diff --git a/nipype/pipeline/plugins/tests/test_callback.py b/nipype/pipeline/plugins/tests/test_callback.py index 822d13c5a9..bfe03463d1 100644 --- a/nipype/pipeline/plugins/tests/test_callback.py +++ b/nipype/pipeline/plugins/tests/test_callback.py @@ -81,13 +81,14 @@ def test_callback_multiproc_normal(tmpdir): assert so.statuses[1][1] == 'end' def test_callback_multiproc_exception(tmpdir): + tmpdir.chdir() + so = Status() - wf = pe.Workflow(name='test', base_dir=str(tmpdir)) + wf = pe.Workflow(name='test') f_node = pe.Node(niu.Function(function=bad_func, input_names=[], output_names=[]), name='f_node') wf.add_nodes([f_node]) - wf.config['execution']['crashdump_dir'] = wf.base_dir try: wf.run(plugin='MultiProc', plugin_args={'status_callback': so.callback}) diff --git a/nipype/pipeline/plugins/tests/test_debug.py b/nipype/pipeline/plugins/tests/test_debug.py index 2bd2003492..3e03abcf90 100644 --- a/nipype/pipeline/plugins/tests/test_debug.py +++ 
b/nipype/pipeline/plugins/tests/test_debug.py @@ -37,18 +37,21 @@ def test_debug(tmpdir): os.chdir(str(tmpdir)) pipe = pe.Workflow(name='pipe') - mod1 = pe.Node(interface=DebugTestInterface(), name='mod1') - mod2 = pe.MapNode(interface=DebugTestInterface(), - iterfield=['input1'], + mod1 = pe.Node(DebugTestInterface(), name='mod1') + mod2 = pe.MapNode(DebugTestInterface(), iterfield=['input1'], name='mod2') + pipe.connect([(mod1, mod2, [('output1', 'input1')])]) pipe.base_dir = os.getcwd() mod1.inputs.input1 = 1 + run_wf = lambda: pipe.run(plugin="Debug") with pytest.raises(ValueError): run_wf() + + exc = None try: pipe.run(plugin="Debug", plugin_args={'callable': callme}) - exception_raised = False - except Exception: - exception_raised = True - assert not exception_raised + except Exception as e: + exc = e + + assert exc is None, 'unexpected exception caught' diff --git a/nipype/pipeline/plugins/tests/test_multiproc.py b/nipype/pipeline/plugins/tests/test_multiproc.py index 6dab555a11..780763405c 100644 --- a/nipype/pipeline/plugins/tests/test_multiproc.py +++ b/nipype/pipeline/plugins/tests/test_multiproc.py @@ -1,14 +1,14 @@ # -*- coding: utf-8 -*- -import logging -import os, sys -from multiprocessing import cpu_count - -import nipype.interfaces.base as nib -from nipype.utils import draw_gantt_chart +# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- +# vi: set ft=python sts=4 ts=4 sw=4 et: +""" +Test the resource management of MultiProc +""" +import os import pytest -import nipype.pipeline.engine as pe -from nipype.pipeline.plugins.callback_log import log_nodes_cb -from nipype.pipeline.plugins.multiproc import get_system_total_memory_gb +from nipype.pipeline import engine as pe +from nipype.interfaces import base as nib + class InputSpec(nib.TraitedSpec): input1 = nib.traits.Int(desc='a random int') @@ -32,12 +32,13 @@ def _list_outputs(self): outputs['output1'] = [1, self.inputs.input1] return outputs + def test_run_multiproc(tmpdir): - os.chdir(str(tmpdir)) + tmpdir.chdir() pipe = pe.Workflow(name='pipe') - mod1 = pe.Node(interface=MultiprocTestInterface(), name='mod1') - mod2 = pe.MapNode(interface=MultiprocTestInterface(), + mod1 = pe.Node(MultiprocTestInterface(), name='mod1') + mod2 = pe.MapNode(MultiprocTestInterface(), iterfield=['input1'], name='mod2') pipe.connect([(mod1, mod2, [('output1', 'input1')])]) @@ -45,7 +46,7 @@ def test_run_multiproc(tmpdir): mod1.inputs.input1 = 1 pipe.config['execution']['poll_sleep_duration'] = 2 execgraph = pipe.run(plugin="MultiProc") - names = ['.'.join((node._hierarchy, node.name)) for node in execgraph.nodes()] + names = [node.fullname for node in execgraph.nodes()] node = list(execgraph.nodes())[names.index('pipe.mod1')] result = node.get_output('output1') assert result == [1, 1] @@ -74,67 +75,13 @@ def _list_outputs(self): return outputs -def find_metrics(nodes, last_node): - """ - """ - - # Import packages - from dateutil.parser import parse - import datetime - - start = nodes[0]['start'] - total_duration = int((last_node['finish'] - start).total_seconds()) - - total_memory = [] - total_threads = [] - for i in range(total_duration): - total_memory.append(0) - total_threads.append(0) - - now = start - for i in range(total_duration): - start_index = 0 - node_start = None - node_finish = None - - x = now - - for j in range(start_index, len(nodes)): - node_start = nodes[j]['start'] - node_finish = nodes[j]['finish'] - - if node_start < x and node_finish > x: - total_memory[i] += float(nodes[j]['estimated_memory_gb']) 
- total_threads[i] += int(nodes[j]['num_threads']) - start_index = j - - if node_start > x: - break - - now += datetime.timedelta(seconds=1) - - return total_memory, total_threads - -def test_no_more_memory_than_specified(): - LOG_FILENAME = 'callback.log' - my_logger = logging.getLogger('callback') - my_logger.setLevel(logging.DEBUG) - - # Add the log message handler to the logger - handler = logging.FileHandler(LOG_FILENAME) - my_logger.addHandler(handler) - - max_memory = 1 +def test_no_more_memory_than_specified(tmpdir): + tmpdir.chdir() pipe = pe.Workflow(name='pipe') - n1 = pe.Node(interface=SingleNodeTestInterface(), name='n1') - n2 = pe.Node(interface=SingleNodeTestInterface(), name='n2') - n3 = pe.Node(interface=SingleNodeTestInterface(), name='n3') - n4 = pe.Node(interface=SingleNodeTestInterface(), name='n4') - - n1.interface.estimated_memory_gb = 1 - n2.interface.estimated_memory_gb = 1 - n3.interface.estimated_memory_gb = 1 - n4.interface.estimated_memory_gb = 1 + n1 = pe.Node(SingleNodeTestInterface(), name='n1', mem_gb=1) + n2 = pe.Node(SingleNodeTestInterface(), name='n2', mem_gb=1) + n3 = pe.Node(SingleNodeTestInterface(), name='n3', mem_gb=1) + n4 = pe.Node(SingleNodeTestInterface(), name='n4', mem_gb=1) pipe.connect(n1, 'output1', n2, 'input1') pipe.connect(n1, 'output1', n3, 'input1') @@ -142,87 +89,49 @@ def test_no_more_memory_than_specified(): pipe.connect(n3, 'output1', n4, 'input2') n1.inputs.input1 = 1 - pipe.run(plugin='MultiProc', - plugin_args={'memory_gb': max_memory, - 'status_callback': log_nodes_cb}) - + max_memory = 0.5 + with pytest.raises(RuntimeError): + pipe.run(plugin='MultiProc', + plugin_args={'memory_gb': max_memory, + 'n_procs': 2}) - nodes = draw_gantt_chart.log_to_dict(LOG_FILENAME) - last_node = nodes[-1] - #usage in every second - memory, threads = find_metrics(nodes, last_node) - result = True - for m in memory: - if m > max_memory: - result = False - break +def test_no_more_threads_than_specified(tmpdir): + tmpdir.chdir() - assert result - - max_threads = cpu_count() - - result = True - for t in threads: - if t > max_threads: - result = False - break + pipe = pe.Workflow(name='pipe') + n1 = pe.Node(SingleNodeTestInterface(), name='n1', n_procs=2) + n2 = pe.Node(SingleNodeTestInterface(), name='n2', n_procs=2) + n3 = pe.Node(SingleNodeTestInterface(), name='n3', n_procs=4) + n4 = pe.Node(SingleNodeTestInterface(), name='n4', n_procs=2) - assert result,\ - "using more threads than system has (threads is not specified by user)" + pipe.connect(n1, 'output1', n2, 'input1') + pipe.connect(n1, 'output1', n3, 'input1') + pipe.connect(n2, 'output1', n4, 'input1') + pipe.connect(n3, 'output1', n4, 'input2') + n1.inputs.input1 = 4 - os.remove(LOG_FILENAME) + max_threads = 2 + with pytest.raises(RuntimeError): + pipe.run(plugin='MultiProc', + plugin_args={'n_procs': max_threads}) -def test_no_more_threads_than_specified(): - LOG_FILENAME = 'callback.log' - my_logger = logging.getLogger('callback') - my_logger.setLevel(logging.DEBUG) - # Add the log message handler to the logger - handler = logging.FileHandler(LOG_FILENAME) - my_logger.addHandler(handler) +def test_hold_job_until_procs_available(tmpdir): + tmpdir.chdir() - max_threads = 4 pipe = pe.Workflow(name='pipe') - n1 = pe.Node(interface=SingleNodeTestInterface(), name='n1') - n2 = pe.Node(interface=SingleNodeTestInterface(), name='n2') - n3 = pe.Node(interface=SingleNodeTestInterface(), name='n3') - n4 = pe.Node(interface=SingleNodeTestInterface(), name='n4') - - n1.interface.num_threads = 1 - 
n2.interface.num_threads = 1 - n3.interface.num_threads = 4 - n4.interface.num_threads = 1 + n1 = pe.Node(SingleNodeTestInterface(), name='n1', n_procs=2) + n2 = pe.Node(SingleNodeTestInterface(), name='n2', n_procs=2) + n3 = pe.Node(SingleNodeTestInterface(), name='n3', n_procs=2) + n4 = pe.Node(SingleNodeTestInterface(), name='n4', n_procs=2) pipe.connect(n1, 'output1', n2, 'input1') pipe.connect(n1, 'output1', n3, 'input1') pipe.connect(n2, 'output1', n4, 'input1') pipe.connect(n3, 'output1', n4, 'input2') n1.inputs.input1 = 4 - pipe.config['execution']['poll_sleep_duration'] = 1 - pipe.run(plugin='MultiProc', plugin_args={'n_procs': max_threads, - 'status_callback': log_nodes_cb}) - - nodes = draw_gantt_chart.log_to_dict(LOG_FILENAME) - last_node = nodes[-1] - #usage in every second - memory, threads = find_metrics(nodes, last_node) - - result = True - for t in threads: - if t > max_threads: - result = False - break - - assert result, "using more threads than specified" - - max_memory = get_system_total_memory_gb() - result = True - for m in memory: - if m > max_memory: - result = False - break - assert result,\ - "using more memory than system has (memory is not specified by user)" - - os.remove(LOG_FILENAME) + + max_threads = 2 + pipe.run(plugin='MultiProc', + plugin_args={'n_procs': max_threads}) diff --git a/nipype/pipeline/plugins/tests/test_tools.py b/nipype/pipeline/plugins/tests/test_tools.py new file mode 100644 index 0000000000..479cc773df --- /dev/null +++ b/nipype/pipeline/plugins/tests/test_tools.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- +# vi: set ft=python sts=4 ts=4 sw=4 et: +"""Tests for the engine module +""" +import numpy as np +import scipy.sparse as ssp +import re + +import mock + +from nipype.pipeline.plugins.tools import report_crash + +def test_report_crash(): + with mock.patch('pickle.dump', mock.MagicMock()) as mock_pickle_dump: + with mock.patch('nipype.pipeline.plugins.tools.format_exception', mock.MagicMock()): # see iss 1517 + mock_pickle_dump.return_value = True + mock_node = mock.MagicMock(name='mock_node') + mock_node._id = 'an_id' + mock_node.config = { + 'execution' : { + 'crashdump_dir' : '.', + 'crashfile_format' : 'pklz', + } + } + + actual_crashfile = report_crash(mock_node) + + expected_crashfile = re.compile('.*/crash-.*-an_id-[0-9a-f\-]*.pklz') + + assert expected_crashfile.match(actual_crashfile).group() == actual_crashfile + assert mock_pickle_dump.call_count == 1 + +''' +Can use the following code to test that a mapnode crash continues successfully +Need to put this into a nose-test with a timeout + +import nipype.interfaces.utility as niu +import nipype.pipeline.engine as pe + +wf = pe.Workflow(name='test') + +def func(arg1): + if arg1 == 2: + raise Exception('arg cannot be ' + str(arg1)) + return arg1 + +funkynode = pe.MapNode(niu.Function(function=func, input_names=['arg1'], output_names=['out']), + iterfield=['arg1'], + name = 'functor') +funkynode.inputs.arg1 = [1,2] + +wf.add_nodes([funkynode]) +wf.base_dir = '/tmp' + +wf.run(plugin='MultiProc') +''' diff --git a/nipype/pipeline/plugins/tools.py b/nipype/pipeline/plugins/tools.py new file mode 100644 index 0000000000..499a1db2d7 --- /dev/null +++ b/nipype/pipeline/plugins/tools.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- +# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- +# vi: set ft=python sts=4 ts=4 sw=4 et: +"""Common graph operations for execution +""" +from __future__ import 
print_function, division, unicode_literals, absolute_import +from builtins import open + +import os +import getpass +from socket import gethostname +import sys +import uuid +from time import strftime +from traceback import format_exception + +from ... import logging +from ...utils.filemanip import savepkl, crash2txt + +logger = logging.getLogger('workflow') + + +def report_crash(node, traceback=None, hostname=None): + """Writes crash related information to a file + """ + name = node._id + host = None + if node.result and getattr(node.result, 'runtime'): + if isinstance(node.result.runtime, list): + host = node.result.runtime[0].hostname + else: + host = node.result.runtime.hostname + + # Try everything to fill in the host + host = host or hostname or gethostname() + logger.error('Node %s failed to run on host %s.', name, host) + if not traceback: + traceback = format_exception(*sys.exc_info()) + timeofcrash = strftime('%Y%m%d-%H%M%S') + login_name = getpass.getuser() + crashfile = 'crash-%s-%s-%s-%s' % ( + timeofcrash, login_name, name, str(uuid.uuid4())) + crashdir = node.config['execution'].get('crashdump_dir', os.getcwd()) + + if not os.path.exists(crashdir): + os.makedirs(crashdir) + crashfile = os.path.join(crashdir, crashfile) + + if node.config['execution']['crashfile_format'].lower() in ['text', 'txt']: + crashfile += '.txt' + else: + crashfile += '.pklz' + + logger.error('Saving crash info to %s\n%s', crashfile, ''.join(traceback)) + if crashfile.endswith('.txt'): + crash2txt(crashfile, dict(node=node, traceback=traceback)) + else: + savepkl(crashfile, dict(node=node, traceback=traceback)) + return crashfile + + +def report_nodes_not_run(notrun): + """List nodes that crashed with crashfile info + + Optionally displays dependent nodes that weren't executed as a result of + the crash. + """ + if notrun: + logger.info("***********************************") + for info in notrun: + logger.error("could not run node: %s" % + '.'.join((info['node']._hierarchy, + info['node']._id))) + logger.info("crashfile: %s" % info['crashfile']) + logger.debug("The following dependent nodes were not run") + for subnode in info['dependents']: + logger.debug(subnode._id) + logger.info("***********************************") + raise RuntimeError(('Workflow did not execute cleanly. 
' + 'Check log for details')) + + +def create_pyscript(node, updatehash=False, store_exception=True): + # pickle node + timestamp = strftime('%Y%m%d_%H%M%S') + if node._hierarchy: + suffix = '%s_%s_%s' % (timestamp, node._hierarchy, node._id) + batch_dir = os.path.join(node.base_dir, + node._hierarchy.split('.')[0], + 'batch') + else: + suffix = '%s_%s' % (timestamp, node._id) + batch_dir = os.path.join(node.base_dir, 'batch') + if not os.path.exists(batch_dir): + os.makedirs(batch_dir) + pkl_file = os.path.join(batch_dir, 'node_%s.pklz' % suffix) + savepkl(pkl_file, dict(node=node, updatehash=updatehash)) + mpl_backend = node.config["execution"]["matplotlib_backend"] + # create python script to load and trap exception + cmdstr = """import os +import sys + +can_import_matplotlib = True #Silently allow matplotlib to be ignored +try: + import matplotlib + matplotlib.use('%s') +except ImportError: + can_import_matplotlib = False + pass + +from nipype import config, logging +from nipype.utils.filemanip import loadpkl, savepkl +from socket import gethostname +from traceback import format_exception +info = None +pklfile = '%s' +batchdir = '%s' +from nipype.utils.filemanip import loadpkl, savepkl +try: + if not sys.version_info < (2, 7): + from collections import OrderedDict + config_dict=%s + config.update_config(config_dict) + ## Only configure matplotlib if it was successfully imported, + ## matplotlib is an optional component to nipype + if can_import_matplotlib: + config.update_matplotlib() + logging.update_logging(config) + traceback=None + cwd = os.getcwd() + info = loadpkl(pklfile) + result = info['node'].run(updatehash=info['updatehash']) +except Exception as e: + etype, eval, etr = sys.exc_info() + traceback = format_exception(etype,eval,etr) + if info is None or not os.path.exists(info['node'].output_dir()): + result = None + resultsfile = os.path.join(batchdir, 'crashdump_%s.pklz') + else: + result = info['node'].result + resultsfile = os.path.join(info['node'].output_dir(), + 'result_%%s.pklz'%%info['node'].name) +""" + if store_exception: + cmdstr += """ + savepkl(resultsfile, dict(result=result, hostname=gethostname(), + traceback=traceback)) +""" + else: + cmdstr += """ + if info is None: + savepkl(resultsfile, dict(result=result, hostname=gethostname(), + traceback=traceback)) + else: + from nipype.pipeline.plugins.base import report_crash + report_crash(info['node'], traceback, gethostname()) + raise Exception(e) +""" + cmdstr = cmdstr % (mpl_backend, pkl_file, batch_dir, node.config, suffix) + pyscript = os.path.join(batch_dir, 'pyscript_%s.py' % suffix) + with open(pyscript, 'wt') as fp: + fp.writelines(cmdstr) + return pyscript diff --git a/nipype/utils/config.py b/nipype/utils/config.py index ebea9e5816..2442f1eb01 100644 --- a/nipype/utils/config.py +++ b/nipype/utils/config.py @@ -11,21 +11,27 @@ ''' from __future__ import print_function, division, unicode_literals, absolute_import import os -import shutil import errno from warnings import warn from io import StringIO from distutils.version import LooseVersion -from simplejson import load, dump +import configparser import numpy as np -from builtins import str, object, open +from builtins import bytes, str, object, open + +from simplejson import load, dump from future import standard_library +from ..external import portalocker +from .misc import str2bool + standard_library.install_aliases() -import configparser -from ..external import portalocker +CONFIG_DEPRECATIONS = { + 'profile_runtime': ('resource_monitor', '1.0'), + 
'filemanip_level': ('utils_level', '1.0'), +} NUMPY_MMAP = LooseVersion(np.__version__) >= LooseVersion('1.12.0') @@ -34,7 +40,7 @@ default_cfg = """ [logging] workflow_level = INFO -filemanip_level = INFO +utils_level = INFO interface_level = INFO log_to_file = false log_directory = %s @@ -64,7 +70,8 @@ parameterize_dirs = true poll_sleep_duration = 2 xvfb_max_wait = 10 -profile_runtime = false +resource_monitor = false +resource_monitor_frequency = 1 [check] interval = 1209600 @@ -91,9 +98,18 @@ def __init__(self, *args, **kwargs): config_file = os.path.join(config_dir, 'nipype.cfg') self.data_file = os.path.join(config_dir, 'nipype.json') self._config.readfp(StringIO(default_cfg)) + self._resource_monitor = None if os.path.exists(config_dir): self._config.read([config_file, 'nipype.cfg']) + for option in CONFIG_DEPRECATIONS: + for section in ['execution', 'logging']: + if self.has_option(section, option): + new_option = CONFIG_DEPRECATIONS[option][0] + if not self.has_option(section, new_option): + # Warn implicit in get + self.set(section, new_option, self.get(section, option)) + def set_default_config(self): self._config.readfp(StringIO(default_cfg)) @@ -114,13 +130,29 @@ def set_log_dir(self, log_dir): """ self._config.set('logging', 'log_directory', log_dir) - def get(self, section, option): - return self._config.get(section, option) + def get(self, section, option, default=None): + if option in CONFIG_DEPRECATIONS: + msg = ('Config option "%s" has been deprecated as of nipype %s. Please use ' + '"%s" instead.') % (option, CONFIG_DEPRECATIONS[option][1], + CONFIG_DEPRECATIONS[option][0]) + warn(msg) + option = CONFIG_DEPRECATIONS[option][0] + + if self._config.has_option(section, option): + return self._config.get(section, option) + return default def set(self, section, option, value): if isinstance(value, bool): value = str(value) + if option in CONFIG_DEPRECATIONS: + msg = ('Config option "%s" has been deprecated as of nipype %s. 
Please use ' + '"%s" instead.') % (option, CONFIG_DEPRECATIONS[option][1], + CONFIG_DEPRECATIONS[option][0]) + warn(msg) + option = CONFIG_DEPRECATIONS[option][0] + return self._config.set(section, option, value) def getboolean(self, section, option): @@ -172,3 +204,42 @@ def update_matplotlib(self): def enable_provenance(self): self._config.set('execution', 'write_provenance', 'true') self._config.set('execution', 'hash_method', 'content') + + @property + def resource_monitor(self): + """Check if resource_monitor is available""" + if self._resource_monitor is not None: + return self._resource_monitor + + # Cache config from nipype config + self.resource_monitor = self._config.get( + 'execution', 'resource_monitor') or False + return self._resource_monitor + + @resource_monitor.setter + def resource_monitor(self, value): + # Accept string true/false values + if isinstance(value, (str, bytes)): + value = str2bool(value.lower()) + + if value is False: + self._resource_monitor = False + elif value is True: + if not self._resource_monitor: + # Before setting self._resource_monitor check psutil availability + self._resource_monitor = False + try: + import psutil + self._resource_monitor = LooseVersion( + psutil.__version__) >= LooseVersion('5.0') + except ImportError: + pass + finally: + if not self._resource_monitor: + warn('Could not enable the resource monitor: psutil>=5.0' + ' could not be imported.') + self._config.set('execution', 'resource_monitor', + ('%s' % self._resource_monitor).lower()) + + def enable_resource_monitor(self): + self.resource_monitor = True diff --git a/nipype/utils/docparse.py b/nipype/utils/docparse.py index ebf52d06d3..0d6bce7d45 100644 --- a/nipype/utils/docparse.py +++ b/nipype/utils/docparse.py @@ -254,6 +254,7 @@ def get_doc(cmd, opt_map, help_flag=None, trap_error=True): """ res = CommandLine('which %s' % cmd.split(' ')[0], + resource_monitor=False, terminal_output='allatonce').run() cmd_path = res.runtime.stdout.strip() if cmd_path == '': @@ -330,6 +331,7 @@ def get_params_from_doc(cmd, style='--', help_flag=None, trap_error=True): """ res = CommandLine('which %s' % cmd.split(' ')[0], + resource_monitor=False, terminal_output='allatonce').run() cmd_path = res.runtime.stdout.strip() if cmd_path == '': diff --git a/nipype/utils/draw_gantt_chart.py b/nipype/utils/draw_gantt_chart.py index 4c18a66f8f..8731aa32eb 100644 --- a/nipype/utils/draw_gantt_chart.py +++ b/nipype/utils/draw_gantt_chart.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- # vi: set ft=python sts=4 ts=4 sw=4 et: -"""Module to draw an html gantt chart from logfile produced by -callback_log.log_nodes_cb() +""" +Module to draw an html gantt chart from logfile produced by +``nipype.utils.profiler.log_nodes_cb()`` """ from __future__ import print_function, division, unicode_literals, absolute_import @@ -11,7 +12,6 @@ import random import datetime import simplejson as json -from dateutil import parser from builtins import str, range, open # Py2 compat: http://python-future.org/compatible_idioms.html#collections-counter-and-ordereddict from future import standard_library @@ -102,61 +102,14 @@ def log_to_dict(logfile): ''' # Init variables - #keep track of important vars - nodes_list = [] #all the parsed nodes - unifinished_nodes = [] #all start nodes that dont have a finish yet - with open(logfile, 'r') as content: - #read file separating each line - content = content.read() - lines = content.split('\n') - - for l in lines: - #try to 
parse each line and transform in a json dict. - #if the line has a bad format, just skip - node = None - try: - node = json.loads(l) - except ValueError: - pass - - if not node: - continue - - #if it is a start node, add to unifinished nodes - if 'start' in node: - node['start'] = parser.parse(node['start']) - unifinished_nodes.append(node) - - #if it is end node, look in uninished nodes for matching start - #remove from unifinished list and add to node list - elif 'finish' in node: - node['finish'] = parser.parse(node['finish']) - #because most nodes are small, we look backwards in the unfinished list - for s in range(len(unifinished_nodes)): - aux = unifinished_nodes[s] - #found the end for node start, copy over info - if aux['id'] == node['id'] and aux['name'] == node['name'] \ - and aux['start'] < node['finish']: - node['start'] = aux['start'] - node['duration'] = \ - (node['finish'] - node['start']).total_seconds() - - unifinished_nodes.remove(aux) - nodes_list.append(node) - break - - #finished parsing - #assume nodes without finish didn't finish running. - #set their finish to last node run - last_node = nodes_list[-1] - for n in unifinished_nodes: - n['finish'] = last_node['finish'] - n['duration'] = (n['finish'] - n['start']).total_seconds() - nodes_list.append(n) - - # Return list of nodes - return nodes_list + # read file separating each line + lines = content.readlines() + + nodes_list = [json.loads(l) for l in lines] + + # Return list of nodes + return nodes_list def calculate_resource_timeseries(events, resource): @@ -453,7 +406,7 @@ def generate_gantt_chart(logfile, cores, minute_scale=10, ----- # import logging # import logging.handlers - # from nipype.pipeline.plugins.callback_log import log_nodes_cb + # from nipype.utils.profiler import log_nodes_cb # log_filename = 'callback.log' # logger = logging.getLogger('callback') diff --git a/nipype/utils/filemanip.py b/nipype/utils/filemanip.py index 59b269e943..e321a597a6 100644 --- a/nipype/utils/filemanip.py +++ b/nipype/utils/filemanip.py @@ -27,7 +27,7 @@ from .misc import is_container from ..interfaces.traits_extension import isdefined -fmlogger = logging.getLogger("filemanip") +fmlogger = logging.getLogger('utils') related_filetype_sets = [ diff --git a/nipype/utils/logger.py b/nipype/utils/logger.py index b30b50bc72..4604cc4145 100644 --- a/nipype/utils/logger.py +++ b/nipype/utils/logger.py @@ -6,6 +6,7 @@ # vi: set ft=python sts=4 ts=4 sw=4 et: import logging +from warnings import warn import os import sys from .misc import str2bool @@ -15,7 +16,6 @@ RFHandler except ImportError: # Next 2 lines are optional: issue a warning to the user - from warnings import warn warn("ConcurrentLogHandler not installed. 
Using builtin log handler") from logging.handlers import RotatingFileHandler as RFHandler @@ -33,10 +33,12 @@ def __init__(self, config): stream=sys.stdout) # logging.basicConfig(stream=sys.stdout) self._logger = logging.getLogger('workflow') + self._utlogger = logging.getLogger('utils') self._fmlogger = logging.getLogger('filemanip') self._iflogger = logging.getLogger('interface') self.loggers = {'workflow': self._logger, + 'utils': self._utlogger, 'filemanip': self._fmlogger, 'interface': self._iflogger} self._hdlr = None @@ -53,15 +55,17 @@ def enable_file_logging(self): formatter = logging.Formatter(fmt=self.fmt, datefmt=self.datefmt) hdlr.setFormatter(formatter) self._logger.addHandler(hdlr) - self._fmlogger.addHandler(hdlr) + self._utlogger.addHandler(hdlr) self._iflogger.addHandler(hdlr) + self._fmlogger.addHandler(hdlr) self._hdlr = hdlr def disable_file_logging(self): if self._hdlr: self._logger.removeHandler(self._hdlr) - self._fmlogger.removeHandler(self._hdlr) + self._utlogger.removeHandler(self._hdlr) self._iflogger.removeHandler(self._hdlr) + self._fmlogger.removeHandler(self._hdlr) self._hdlr = None def update_logging(self, config): @@ -69,14 +73,17 @@ def update_logging(self, config): self.disable_file_logging() self._logger.setLevel(logging.getLevelName(config.get('logging', 'workflow_level'))) - self._fmlogger.setLevel(logging.getLevelName(config.get('logging', - 'filemanip_level'))) + self._utlogger.setLevel(logging.getLevelName(config.get('logging', + 'utils_level'))) self._iflogger.setLevel(logging.getLevelName(config.get('logging', 'interface_level'))) if str2bool(config.get('logging', 'log_to_file')): self.enable_file_logging() def getLogger(self, name): + if name == 'filemanip': + warn('The "filemanip" logger has been deprecated and replaced by ' + 'the "utils" logger as of nipype 1.0') if name in self.loggers: return self.loggers[name] return None diff --git a/nipype/utils/profiler.py b/nipype/utils/profiler.py new file mode 100644 index 0000000000..7dd1823d43 --- /dev/null +++ b/nipype/utils/profiler.py @@ -0,0 +1,361 @@ +# -*- coding: utf-8 -*- +# @Author: oesteban +# @Date: 2017-09-21 15:50:37 +# @Last Modified by: oesteban +# @Last Modified time: 2017-10-02 15:44:29 +""" +Utilities to keep track of performance +""" +from __future__ import print_function, division, unicode_literals, absolute_import + +import threading +from time import time +try: + import psutil +except ImportError as exc: + psutil = None + +from builtins import open, range +from .. 
import config, logging + +proflogger = logging.getLogger('utils') +resource_monitor = config.resource_monitor + +# Init variables +_MB = 1024.0**2 + + +class ResourceMonitor(threading.Thread): + """ + A ``Thread`` to monitor a specific PID with a certain frequence + to a file + """ + + def __init__(self, pid, freq=5, fname=None, python=True): + # Make sure psutil is imported + import psutil + + if freq < 0.2: + raise RuntimeError('Frequency (%0.2fs) cannot be lower than 0.2s' % freq) + + if fname is None: + fname = '.proc-%d_time-%s_freq-%0.2f' % (pid, time(), freq) + self._fname = fname + self._logfile = open(self._fname, 'w') + self._freq = freq + self._python = python + + # Leave process initialized and make first sample + self._process = psutil.Process(pid) + self._sample(cpu_interval=0.2) + + # Start thread + threading.Thread.__init__(self) + self._event = threading.Event() + + @property + def fname(self): + """Get/set the internal filename""" + return self._fname + + def stop(self): + """Stop monitoring""" + if not self._event.is_set(): + self._event.set() + self.join() + self._sample() + self._logfile.flush() + self._logfile.close() + + def _sample(self, cpu_interval=None): + cpu = 0.0 + mem = 0.0 + try: + with self._process.oneshot(): + cpu += self._process.cpu_percent(interval=cpu_interval) + mem += self._process.memory_info().rss + except psutil.NoSuchProcess: + pass + + # parent_mem = mem + # Iterate through child processes and get number of their threads + for child in self._process.children(recursive=True): + try: + with child.oneshot(): + cpu += child.cpu_percent() + mem += child.memory_info().rss + except psutil.NoSuchProcess: + pass + + print('%f,%f,%f' % (time(), (mem / _MB), cpu), + file=self._logfile) + self._logfile.flush() + + def run(self): + """Core monitoring function, called by start()""" + while not self._event.is_set(): + self._sample() + self._event.wait(self._freq) + + +# Log node stats function +def log_nodes_cb(node, status): + """Function to record node run statistics to a log file as json + dictionaries + + Parameters + ---------- + node : nipype.pipeline.engine.Node + the node being logged + status : string + acceptable values are 'start', 'end'; otherwise it is + considered and error + + Returns + ------- + None + this function does not return any values, it logs the node + status info to the callback logger + """ + + if status != 'end': + return + + # Import packages + import logging + import json + + status_dict = { + 'name': node.name, + 'id': node._id, + 'start': getattr(node.result.runtime, 'startTime'), + 'finish': getattr(node.result.runtime, 'endTime'), + 'duration': getattr(node.result.runtime, 'duration'), + 'runtime_threads': getattr( + node.result.runtime, 'cpu_percent', 'N/A'), + 'runtime_memory_gb': getattr( + node.result.runtime, 'mem_peak_gb', 'N/A'), + 'estimated_memory_gb': node.mem_gb, + 'num_threads': node.n_procs, + } + + if status_dict['start'] is None or status_dict['finish'] is None: + status_dict['error'] = True + + # Dump string to log + logging.getLogger('callback').debug(json.dumps(status_dict)) + + +# Get total system RAM +def get_system_total_memory_gb(): + """ + Function to get the total RAM of the running system in GB + """ + + # Import packages + import os + import sys + + # Get memory + if 'linux' in sys.platform: + with open('/proc/meminfo', 'r') as f_in: + meminfo_lines = f_in.readlines() + mem_total_line = [line for line in meminfo_lines + if 'MemTotal' in line][0] + mem_total = float(mem_total_line.split()[1]) + 
memory_gb = mem_total / (1024.0**2) + elif 'darwin' in sys.platform: + mem_str = os.popen('sysctl hw.memsize').read().strip().split(' ')[-1] + memory_gb = float(mem_str) / (1024.0**3) + else: + err_msg = 'System platform: %s is not supported' + raise Exception(err_msg) + + # Return memory + return memory_gb + + +# Get max resources used for process +def get_max_resources_used(pid, mem_mb, num_threads, pyfunc=False): + """ + Function to get the RAM and threads utilized by a given process + + Parameters + ---------- + pid : integer + the process ID of process to profile + mem_mb : float + the high memory watermark so far during process execution (in MB) + num_threads: int + the high thread watermark so far during process execution + + Returns + ------- + mem_mb : float + the new high memory watermark of process (MB) + num_threads : float + the new high thread watermark of process + """ + + if not resource_monitor: + raise RuntimeError('Attempted to measure resources with ' + '"resource_monitor" set off.') + + try: + mem_mb = max(mem_mb, _get_ram_mb(pid, pyfunc=pyfunc)) + num_threads = max(num_threads, _get_num_threads(pid)) + except Exception as exc: + proflogger.info('Could not get resources used by process.\n%s', exc) + + return mem_mb, num_threads + + +# Get number of threads for process +def _get_num_threads(pid): + """ + Function to get the number of threads a process is using + + Parameters + ---------- + pid : integer + the process ID of process to profile + + Returns + ------- + num_threads : int + the number of threads that the process is using + + """ + + try: + proc = psutil.Process(pid) + # If process is running + if proc.status() == psutil.STATUS_RUNNING: + num_threads = proc.num_threads() + elif proc.num_threads() > 1: + tprocs = [psutil.Process(thr.id) for thr in proc.threads()] + alive_tprocs = [tproc for tproc in tprocs if tproc.status() == psutil.STATUS_RUNNING] + num_threads = len(alive_tprocs) + else: + num_threads = 1 + + child_threads = 0 + # Iterate through child processes and get number of their threads + for child in proc.children(recursive=True): + # Leaf process + if len(child.children()) == 0: + # If process is running, get its number of threads + if child.status() == psutil.STATUS_RUNNING: + child_thr = child.num_threads() + # If its not necessarily running, but still multi-threaded + elif child.num_threads() > 1: + # Cast each thread as a process and check for only running + tprocs = [psutil.Process(thr.id) for thr in child.threads()] + alive_tprocs = [tproc for tproc in tprocs + if tproc.status() == psutil.STATUS_RUNNING] + child_thr = len(alive_tprocs) + # Otherwise, no threads are running + else: + child_thr = 0 + # Increment child threads + child_threads += child_thr + except psutil.NoSuchProcess: + return None + + # Number of threads is max between found active children and parent + num_threads = max(child_threads, num_threads) + + # Return number of threads found + return num_threads + + +# Get ram usage of process +def _get_ram_mb(pid, pyfunc=False): + """ + Function to get the RAM usage of a process and its children + Reference: http://ftp.dev411.com/t/python/python-list/095thexx8g/\ +multiprocessing-forking-memory-usage + + Parameters + ---------- + pid : integer + the PID of the process to get RAM usage of + pyfunc : boolean (optional); default=False + a flag to indicate if the process is a python function; + when Pythons are multithreaded via multiprocess or threading, + children functions include their own memory + parents. 
if this + is set, the parent memory will removed from children memories + + + Returns + ------- + mem_mb : float + the memory RAM in MB utilized by the process PID + + """ + try: + # Init parent + parent = psutil.Process(pid) + # Get memory of parent + parent_mem = parent.memory_info().rss + mem_mb = parent_mem / _MB + # Iterate through child processes + for child in parent.children(recursive=True): + child_mem = child.memory_info().rss + if pyfunc: + child_mem -= parent_mem + mem_mb += child_mem / _MB + except psutil.NoSuchProcess: + return None + + # Return memory + return mem_mb + + +def _use_cpu(x): + ctr = 0 + while ctr < 1e7: + ctr += 1 + x*x + +# Spin multiple threads +def _use_resources(n_procs, mem_gb): + """ + Function to execute multiple use_gb_ram functions in parallel + """ + import os + import sys + import psutil + from multiprocessing import Pool + from nipype import logging + from nipype.utils.profiler import _use_cpu + + iflogger = logging.getLogger('interface') + + # Getsize of one character string + BSIZE = sys.getsizeof(' ') - sys.getsizeof(' ') + BOFFSET = sys.getsizeof('') + _GB = 1024.0**3 + + def _use_gb_ram(mem_gb): + """A test function to consume mem_gb GB of RAM""" + num_bytes = int(mem_gb * _GB) + # Eat mem_gb GB of memory for 1 second + gb_str = ' ' * ((num_bytes - BOFFSET) // BSIZE) + assert sys.getsizeof(gb_str) == num_bytes + return gb_str + + # Measure the amount of memory this process already holds + p = psutil.Process(os.getpid()) + mem_offset = p.memory_info().rss / _GB + big_str = _use_gb_ram(mem_gb - mem_offset) + _use_cpu(5) + mem_total = p.memory_info().rss / _GB + del big_str + iflogger.info('[%d] Memory offset %0.2fGB, total %0.2fGB', + os.getpid(), mem_offset, mem_total) + + if n_procs > 1: + pool = Pool(n_procs) + pool.map(_use_cpu, range(n_procs)) + return True diff --git a/nipype/utils/provenance.py b/nipype/utils/provenance.py index 066cbb9a57..c316f67272 100644 --- a/nipype/utils/provenance.py +++ b/nipype/utils/provenance.py @@ -21,7 +21,7 @@ from .. import get_info, logging, __version__ from .filemanip import (md5, hashlib, hash_infile) -iflogger = logging.getLogger('interface') +logger = logging.getLogger('utils') foaf = pm.Namespace("foaf", "http://xmlns.com/foaf/0.1/") dcterms = pm.Namespace("dcterms", "http://purl.org/dc/terms/") nipype_ns = pm.Namespace("nipype", "http://nipy.org/nipype/terms/") @@ -173,7 +173,7 @@ def safe_encode(x, as_literal=True): jsonstr = json.dumps(outdict) except UnicodeDecodeError as excp: jsonstr = "Could not encode dictionary. {}".format(excp) - iflogger.warn('Prov: %s', jsonstr) + logger.warning('Prov: %s', jsonstr) if not as_literal: return jsonstr @@ -203,7 +203,7 @@ def safe_encode(x, as_literal=True): jsonstr = json.dumps(x) except UnicodeDecodeError as excp: jsonstr = "Could not encode list/tuple. 
{}".format(excp) - iflogger.warn('Prov: %s', jsonstr) + logger.warning('Prov: %s', jsonstr) if not as_literal: return jsonstr @@ -285,9 +285,20 @@ def prov_encode(graph, value, create_container=True): def write_provenance(results, filename='provenance', format='all'): - ps = ProvStore() - ps.add_results(results) - return ps.write_provenance(filename=filename, format=format) + prov = None + try: + ps = ProvStore() + ps.add_results(results) + prov = ps.write_provenance(filename=filename, format=format) + except Exception as e: + import traceback + err_msg = traceback.format_exc() + if getattr(e, 'args'): + err_msg += '\n\nException arguments:\n' + ', '.join(['"%s"' % arg for arg in e.args]) + logger.warning('Writing provenance failed - Exception details:\n%s', err_msg) + + return prov + class ProvStore(object): diff --git a/nipype/utils/spm_docs.py b/nipype/utils/spm_docs.py index 1b7a1a1dc4..3b9942f0af 100644 --- a/nipype/utils/spm_docs.py +++ b/nipype/utils/spm_docs.py @@ -27,7 +27,7 @@ def grab_doc(task_name): """ - cmd = matlab.MatlabCommandLine() + cmd = matlab.MatlabCommand(resource_monitor=False) # We need to tell Matlab where to find our spm_get_doc.m file. cwd = os.path.dirname(__file__) # Build matlab command diff --git a/nipype/utils/tests/use_resources b/nipype/utils/tests/use_resources new file mode 100755 index 0000000000..fd2e860a1a --- /dev/null +++ b/nipype/utils/tests/use_resources @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# +# use_resources + +''' +Python script to use a certain amount of RAM on disk and number of +threads + +Usage: + use_resources -g -p +''' + +# Make main executable +if __name__ == '__main__': + + # Import packages + import argparse + from nipype.utils.profiler import _use_resources + + # Init argparser + parser = argparse.ArgumentParser(description=__doc__) + + # Add arguments + parser.add_argument('-g', '--num_gb', required=True, type=float, + help='Number of GB RAM to use, can be float or int') + parser.add_argument('-p', '--num_threads', required=True, type=int, + help='Number of threads to run in parallel') + + # Parse args + args = parser.parse_args() + _use_resources(args.num_threads, args.num_gb)