From b44ef0d600b50ca5ab4c38ab725e05bea452fb96 Mon Sep 17 00:00:00 2001 From: tyarkoni Date: Sun, 17 Jul 2016 14:03:48 -0500 Subject: [PATCH 01/10] first pass at working event transformer interface --- nipype/algorithms/events.py | 349 ++++++++++++++++++++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 nipype/algorithms/events.py diff --git a/nipype/algorithms/events.py b/nipype/algorithms/events.py new file mode 100644 index 0000000000..83941485bd --- /dev/null +++ b/nipype/algorithms/events.py @@ -0,0 +1,349 @@ +from __future__ import division +from builtins import range + +from copy import deepcopy +import os + +from nibabel import load +import numpy as np +from nipype.external.six import string_types +from nipype.interfaces.base import (BaseInterface, TraitedSpec, InputMultiPath, + traits, File, Bunch, BaseInterfaceInputSpec, + isdefined, OutputMultiPath) +from nipype import config, logging +import re +from glob import glob +from os.path import basename +iflogger = logging.getLogger('interface') + +# Eventually, move the following inside the interface, or wrap in an import check +import pandas as pd +import sklearn.preprocessing as preproc +from sklearn.linear_model import LinearRegression + + +def alias(target, append=False): + def decorator(func): + def wrapper(self, cols, groupby=None, names=None, *args, **kwargs): + + cols = self._select_cols(cols) + + # groupby can be either a single column, in which case it's + # interpreted as a categorical variable to groupby directly, or a + # list of column names, in which case it's interpreted as a set of + # dummy columns to reconstruct a categorical from. + if groupby is not None: + groupby = self._select_cols(groupby) + if len(groupby) > 1: + group_results = [] + names = ['%s_%s' % (cn, gn)for cn in cols for gn in groupby] + for i, col in enumerate(groupby): + _result = np.zeros((len(self.data), len(cols))) + inds = self.data[col].nonzero() + _result[inds] = target(self.data[cols].iloc[inds], *args, **kwargs) + group_results.extend(_result.T) + result = np.c_[group_results].squeeze().T + result = pd.DataFrame(result, columns=names) + else: + result = self.data.groupby(groupby).apply(target, *args, **kwargs) + else: + result = target(self.data[cols], *args, **kwargs) + + if append: + names = result.columns + + if names is not None: + cols = names + + self.data[cols] = result + return wrapper + return decorator + + +class EventTransformer(object): + + def __init__(self, events, orig_hz=1, target_hz=1000): + self.events = events + self.orig_hz = orig_hz + self.target_hz = target_hz + self._to_dense() + + def _select_cols(self, cols): + if isinstance(cols, string_types) and '*' in cols: + # if '*' in cols: + patt = re.compile(cols.replace('*', '.*')) + cols = [l for l in self.data.columns.tolist() for m in [patt.search(l)] if m] + return cols + + @alias(preproc.scale) + def standardize(): pass + + @alias(np.log) + def log(): pass + + @alias(preproc.binarize) + def binarize(): pass + + @alias(np.logical_or) + def or_(): pass + + @alias(np.logical_and) + def and_(): pass + + @alias(np.logical_not) + def not_(): pass + + @alias(pd.get_dummies, append=True) + def factor(): pass + + def orthogonalize(self, y_cols, x_cols): + y_cols, x_cols = self._select_cols(y_cols), listify(x_cols) + x = self.data[x_cols].values + y = self.data[y_cols].values + est = LinearRegression() + est.fit(x, y) + y -= est.predict(x) + if np.linalg.norm(y, 1) > np.exp(-32): + self.data[y_cols] = y + + def formula(self, f, target=None, replace=False, 
+                *args, **kwargs):
+        from patsy import dmatrix
+        result = dmatrix(f, self.data, return_type='dataframe', *args, **kwargs)
+        if target is not None:
+            self.data[target] = result
+        elif replace:
+            self.data[result.columns] = result
+        else:
+            raise ValueError("Either a target column must be passed or replace"
+                             " must be True.")
+
+    def multiply(self, y_cols, x_cols):
+        x_cols = self._select_cols(x_cols)
+        result = self.data[x_cols].apply(lambda x: np.multiply(x, self.data[y_cols]))
+        names = ['%s_%s' % (y_cols, x) for x in x_cols]
+        self.data[names] = result
+
+    def query(self, q, *args, **kwargs):
+        self.data = self.data.query(q)
+
+    def apply(self, func, *args, **kwargs):
+        self.data = func(self.data, *args, **kwargs)
+
+    def _to_dense(self):
+
+        end = int((self.events['onset'] + self.events['duration']).max())
+
+        targ_hz, orig_hz = self.target_hz, self.orig_hz
+        len_ts = end * targ_hz
+        conditions = self.events['condition'].unique().tolist()
+        n_conditions = len(conditions)
+        ts = np.zeros((len_ts, n_conditions))
+
+        _events = self.events.copy().reset_index()
+        _events[['onset', 'duration']] = _events[
+            ['onset', 'duration']] * targ_hz / orig_hz
+
+        cond_index = [conditions.index(c) for c in _events['condition']]
+        ev_end = np.round(_events['onset'] + _events['duration']).astype(int)
+        onsets = np.round(_events['onset']).astype(int)
+
+        for i, row in _events.iterrows():
+            ts[onsets[i]:ev_end[i], cond_index[i]] = row['amplitude']
+
+        self.data = pd.DataFrame(ts, columns=conditions)
+        onsets = np.arange(len(ts)) / self.target_hz
+        self.data.insert(0, 'onset', onsets)
+
+    def resample(self, sampling_rate):
+        sampling_rate = np.round(sampling_rate * 1000)
+        self.data.index = pd.to_datetime(self.data['onset'], unit='s')
+        self.data = self.data.resample('%dL' % sampling_rate).mean()
+        self.data['onset'] = self.data.index.astype(np.int64) / int(1e9)
+        self.data = self.data.reset_index(drop=True)
+
+
+class EventReader(object):
+
+    def __init__(self, columns=None, header='infer', sep=None,
+                 default_duration=0., default_amplitude=1.,
+                 condition_pattern=None, subject_pattern=None,
+                 run_pattern=None):
+        '''
+        Args:
+            columns (list): Optional list of column names to use. If passed,
+                number of elements must match number of columns in the text
+                files to be read. If omitted, column names are inferred by
+                pandas (depending on value of header).
+            header (str): passed to pandas; see pd.read_table docs for details.
+            sep (str): column separator; see pd.read_table docs for details.
+            default_duration (float): Optional default duration to set for all
+                events. Will be ignored if a column named 'duration' is found.
+            default_amplitude (float): Optional default amplitude to set for
+                all events. Will be ignored if an amplitude column is found.
+            condition_pattern (str): regex with which to capture condition
+                names from input text file filenames. Only the first captured
+                group will be used.
+            subject_pattern (str): regex with which to capture subject
+                names from input text file filenames. Only the first captured
+                group will be used.
+            run_pattern (str): regex with which to capture run names from input
+                text file filenames. Only the first captured group will be used.
+        '''
+
+        self.columns = columns
+        self.header = header
+        self.sep = sep
+        self.default_duration = default_duration
+        self.default_amplitude = default_amplitude
+        self.condition_pattern = condition_pattern
+        self.subject_pattern = subject_pattern
+        self.run_pattern = run_pattern
+
+    def read(self, path, condition=None, subject=None, run=None, rename=None):
+
+        dfs = []
+
+        if isinstance(path, string_types):
+            path = glob(path)
+
+        for f in path:
+            _data = pd.read_table(f, names=self.columns, header=self.header,
+                                  sep=self.sep)
+
+            if rename is not None:
+                _data = _data.rename(columns=rename)
+
+            # Validate and set CODA (condition/onset/duration/amplitude) columns
+            cols = _data.columns
+
+            if 'onset' not in cols:
+                raise ValueError(
+                    "DataFrame is missing mandatory 'onset' column.")
+
+            if 'duration' not in cols:
+                if self.default_duration is None:
+                    raise ValueError(
+                        'Event file "%s" is missing \'duration\''
+                        ' column, and no default_duration was provided.' % f)
+                else:
+                    _data['duration'] = self.default_duration
+
+            if 'amplitude' not in cols:
+                _data['amplitude'] = self.default_amplitude
+
+            if condition is not None:
+                _data['condition'] = condition
+            elif 'condition' not in cols:
+                cp = self.condition_pattern
+                if cp is None:
+                    cp = r'(.*)\.[a-zA-Z0-9]{3,4}'
+                m = re.search(cp, basename(f))
+                if m is None:
+                    raise ValueError(
+                        "No condition column found in event file, no "
+                        "condition argument passed, and attempt to "
+                        "automatically extract condition from filename failed."
+                        " Please make sure a condition is specified.")
+                _data['condition'] = m.group(1)
+
+            if subject is not None:
+                _data['subject'] = subject
+            elif self.subject_pattern is not None:
+                m = re.search(self.subject_pattern, f)
+                if m is None:
+                    raise ValueError(
+                        "Subject pattern '%s' failed to match any part of "
+                        "filename '%s'." % (self.subject_pattern, f))
+                _data['subject'] = m.group(1)
+
+            if run is not None:
+                _data['run'] = run
+            elif self.run_pattern is not None:
+                m = re.search(self.run_pattern, f)
+                if m is None:
+                    raise ValueError(
+                        "Run pattern '%s' failed to match any part of "
+                        "filename '%s'." % (self.run_pattern, f))
+                _data['run'] = m.group(1)
+
+            dfs.append(_data)
+
+        return pd.concat(dfs, axis=0)
+
+
+class TransformEventsInputSpec(BaseInterfaceInputSpec):
+    subject_info = InputMultiPath(Bunch, mandatory=True,
+                xor=['subject_info', 'event_files'],
+                desc=("Bunch or List(Bunch) subject specific condition information. "
+                      "see :ref:`SpecifyModel` or SpecifyModel.__doc__ for details"))
+    event_files = InputMultiPath(traits.List(File(exists=True)), mandatory=True,
+                xor=['subject_info', 'event_files'],
+                desc=('list of event description files in 1, 2, 3, or 4 column format '
+                      'corresponding to onsets, durations, amplitudes, and names'))
+    input_units = traits.Enum('secs', 'scans', mandatory=True,
+                desc=("Units of event onsets and durations (secs or scans). Output "
+                      "units are always in secs"))
+    time_repetition = traits.Float(mandatory=True,
+                desc=("Time between the start of one volume to the start of "
+                      "the next image volume."))
+    # transformations = InputMultiPath(traits.List(File(exists=True)), mandatory=True,
+    #                                  desc=("JSON specification of the transformations to perform."))
+
+
+class TransformEventsOutputSpec(TraitedSpec):
+    subject_info = OutputMultiPath(Bunch, mandatory=True,
+                desc=("Bunch or List(Bunch) subject specific condition information. "
+                      "see :ref:`SpecifyModel` or SpecifyModel.__doc__ for details"))
+
+
+class TransformEvents(BaseInterface):
+
+    input_spec = TransformEventsInputSpec
+    output_spec = TransformEventsOutputSpec
+
+    def _get_event_data(self):
+        if isdefined(self.inputs.subject_info):
+            info = self.inputs.subject_info
+            return pd.DataFrame.from_records(info)
+        else:
+            info = self.inputs.event_files
+            reader = EventReader(columns=['onset', 'duration', 'amplitude'])
+            return reader.read(info[0])
+
+    def _transform_events(self):
+        events = self._get_event_data()
+        self.transformer = EventTransformer(events)
+        print(self.transformer.data.shape)
+        # Transformation application logic goes here later
+        # ...
+        self.transformer.resample(self.inputs.time_repetition)
+        print(self.transformer.data.shape)
+
+    def _run_interface(self, runtime):
+        self._transform_events()
+        return runtime
+
+    def _df_to_bunch(self):
+
+        if not hasattr(self, 'transformer'):
+            self._transform_events()
+
+        _data = self.transformer.data
+
+        info = Bunch(conditions=[], onsets=[], durations=[], amplitudes=[])
+        cols = [c for c in _data.columns if c not in {'onset'}]
+        onsets = _data['onset'].values.tolist()
+        info.conditions = cols
+
+        for col in cols:
+            info.onsets.append(onsets)
+            info.durations.append(self.inputs.time_repetition)
+            info.amplitudes.append(_data[col].values.tolist())
+
+        return info
+
+    def _list_outputs(self):
+        outputs = self._outputs().get()
+        outputs['subject_info'] = self._df_to_bunch()
+        return outputs

From 772d280237a3ca1e6a2d9b39b9a5cbf13dbfa81c Mon Sep 17 00:00:00 2001
From: tyarkoni
Date: Sun, 17 Jul 2016 19:06:15 -0500
Subject: [PATCH 02/10] adds Transformation class to represent one or more
 chained operations on variables

---
 nipype/algorithms/events.py | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/nipype/algorithms/events.py b/nipype/algorithms/events.py
index 83941485bd..74f68cda12 100644
--- a/nipype/algorithms/events.py
+++ b/nipype/algorithms/events.py
@@ -130,7 +130,9 @@ def apply(self, func, *args, **kwargs):
         self.data = func(self.data, *args, **kwargs)
 
     def _to_dense(self):
-
+        """ Convert the sparse [onset, duration, amplitude] representation
+        typical of event files to a dense matrix where each row represents
+        a fixed unit of time. """
         end = int((self.events['onset'] + self.events['duration']).max())
 
         targ_hz, orig_hz = self.target_hz, self.orig_hz
@@ -155,6 +157,11 @@ def _to_dense(self):
         self.data.insert(0, 'onset', onsets)
 
     def resample(self, sampling_rate):
+        """
+        Resample the design matrix to the specified sampling rate. Primarily
+        useful for downsampling to match the TR, so as to export the design as
+        a n(TR) x n(conds) matrix.
+ """ sampling_rate = np.round(sampling_rate * 1000) self.data.index = pd.to_datetime(self.data['onset'], unit='s') self.data = self.data.resample('%dL' % sampling_rate).mean() @@ -296,6 +303,25 @@ class TransformEventsOutputSpec(TraitedSpec): "see :ref:`SpecifyModel` or SpecifyModel.__doc__ for details")) +class Transformation(object): + + def __init__(self, transformer, inputs, steps, outputs=None): + + self.transformer = transformer + self.inputs = inputs + self.steps = steps + self.outputs = outputs + self._validate() + + def _validate(self): + missing = set(transform['input'] - set(self.transformer.data.columns)): + if missing: + raise ValueError("Invalid column(s): %s" % missing) + + def run(self): + pass + + class TransformEvents(BaseInterface): input_spec = TransformEventsInputSpec @@ -313,11 +339,10 @@ def _get_event_data(self): def _transform_events(self): events = self._get_event_data() self.transformer = EventTransformer(events) - print(self.transformer.data.shape) + # Transformation application logic goes here later # ... self.transformer.resample(self.inputs.time_repetition) - print(self.transformer.data.shape) def _run_interface(self, runtime): self._transform_events() From e68ad8c18c804296ec9c7099f1d31017ddc49ea9 Mon Sep 17 00:00:00 2001 From: tyarkoni Date: Sun, 17 Jul 2016 19:30:37 -0500 Subject: [PATCH 03/10] drop sklearn dependency in orthogonalization --- nipype/algorithms/events.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/nipype/algorithms/events.py b/nipype/algorithms/events.py index 74f68cda12..50bb3b82a2 100644 --- a/nipype/algorithms/events.py +++ b/nipype/algorithms/events.py @@ -19,7 +19,6 @@ # Eventually, move the following inside the interface, or wrap in an import check import pandas as pd import sklearn.preprocessing as preproc -from sklearn.linear_model import LinearRegression def alias(target, append=False): @@ -96,15 +95,16 @@ def not_(): pass @alias(pd.get_dummies, append=True) def factor(): pass - def orthogonalize(self, y_cols, x_cols): - y_cols, x_cols = self._select_cols(y_cols), listify(x_cols) - x = self.data[x_cols].values + def orthogonalize(self, y_cols, X_cols): + ''' Orthogonalize each of the variables in y_cols with respect to all + of the variables in x_cols. 
+ ''' + y_cols, X_cols = self._select_cols(y_cols), self._select_cols(X_cols) + X = self.data[X_cols].values y = self.data[y_cols].values - est = LinearRegression() - est.fit(x, y) - y -= est.predict(x) - if np.linalg.norm(y, 1) > np.exp(-32): - self.data[y_cols] = y + _aX = np.c_[np.ones(len(y)), X] + coefs, resids, rank, s = np.linalg.lstsq(_aX, y) + self.data[y_cols] = y - X.dot(coefs[1:]) def formula(self, f, target=None, replace=False, *args, **kwargs): from patsy import dmatrix @@ -314,7 +314,7 @@ def __init__(self, transformer, inputs, steps, outputs=None): self._validate() def _validate(self): - missing = set(transform['input'] - set(self.transformer.data.columns)): + missing = set(transform['input'] - set(self.transformer.data.columns)) if missing: raise ValueError("Invalid column(s): %s" % missing) From 78f7b707fe09379e2cec242a997a47f226e16fb7 Mon Sep 17 00:00:00 2001 From: tyarkoni Date: Sun, 17 Jul 2016 20:08:23 -0500 Subject: [PATCH 04/10] replace sklearn preproc dependencies --- nipype/algorithms/events.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/nipype/algorithms/events.py b/nipype/algorithms/events.py index 50bb3b82a2..3502602436 100644 --- a/nipype/algorithms/events.py +++ b/nipype/algorithms/events.py @@ -18,7 +18,6 @@ # Eventually, move the following inside the interface, or wrap in an import check import pandas as pd -import sklearn.preprocessing as preproc def alias(target, append=False): @@ -74,15 +73,9 @@ def _select_cols(self, cols): cols = [l for l in self.data.columns.tolist() for m in [patt.search(l)] if m] return cols - @alias(preproc.scale) - def standardize(): pass - @alias(np.log) def log(): pass - @alias(preproc.binarize) - def binarize(): pass - @alias(np.logical_or) def or_(): pass @@ -95,6 +88,19 @@ def not_(): pass @alias(pd.get_dummies, append=True) def factor(): pass + def standardize(self, cols, demean=True, rescale=True, copy=True): + cols = self._select_cols(cols) + X = self.data[cols] + self.data[cols] = (X - X.mean()) / np.std(X, 0) + + def binarize(self, cols, threshold=0.0): + cols = self._select_cols(cols) + X = self.data[cols].values + above = X > threshold + X[above] = 1 + X[~above] = 0 + self.data[cols] = X + def orthogonalize(self, y_cols, X_cols): ''' Orthogonalize each of the variables in y_cols with respect to all of the variables in x_cols. 
From 8b9cd569c5e4bfd1b7ec072939b7eca8365ef56a Mon Sep 17 00:00:00 2001 From: tyarkoni Date: Mon, 1 Aug 2016 09:03:31 -0700 Subject: [PATCH 05/10] renamed interface; minor tweaks --- nipype/algorithms/events.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/nipype/algorithms/events.py b/nipype/algorithms/events.py index 3502602436..13f1c8af64 100644 --- a/nipype/algorithms/events.py +++ b/nipype/algorithms/events.py @@ -17,7 +17,10 @@ iflogger = logging.getLogger('interface') # Eventually, move the following inside the interface, or wrap in an import check -import pandas as pd +try: + import pandas as pd +except ImportError: + raise ImportError("The events module requires pandas.") def alias(target, append=False): @@ -88,12 +91,15 @@ def not_(): pass @alias(pd.get_dummies, append=True) def factor(): pass - def standardize(self, cols, demean=True, rescale=True, copy=True): + def _standardize(self, cols, demean=True, rescale=True, copy=True): cols = self._select_cols(cols) X = self.data[cols] self.data[cols] = (X - X.mean()) / np.std(X, 0) - def binarize(self, cols, threshold=0.0): + @alias(_standardize) + def standardize(): pass + + def _binarize(self, cols, threshold=0.0): cols = self._select_cols(cols) X = self.data[cols].values above = X > threshold @@ -101,6 +107,9 @@ def binarize(self, cols, threshold=0.0): X[~above] = 0 self.data[cols] = X + @alias(_binarize) + def binarize(): pass + def orthogonalize(self, y_cols, X_cols): ''' Orthogonalize each of the variables in y_cols with respect to all of the variables in x_cols. @@ -148,8 +157,7 @@ def _to_dense(self): ts = np.zeros((len_ts, n_conditions)) _events = self.events.copy().reset_index() - _events[['onset', 'duration']] = _events[ - ['onset', 'duration']] * targ_hz / orig_hz + _events[['onset', 'duration']] = _events[['onset', 'duration']] * targ_hz / orig_hz cond_index = [conditions.index(c) for c in _events['condition']] ev_end = np.round(_events['onset'] + _events['duration']).astype(int) @@ -284,7 +292,7 @@ def read(self, path, condition=None, subject=None, run=None, rename=None): return pd.concat(dfs, axis=0) -class TransformEventsInputSpec(BaseInterfaceInputSpec): +class SpecifyEventsInputSpec(BaseInterfaceInputSpec): subject_info = InputMultiPath(Bunch, mandatory=True, xor=['subject_info', 'event_files'], desc=("Bunch or List(Bunch) subject specific condition information. " @@ -303,7 +311,7 @@ class TransformEventsInputSpec(BaseInterfaceInputSpec): # desc=("JSON specification of the transformations to perform.")) -class TransformEventsOutputSpec(TraitedSpec): +class SpecifyEventsOutputSpec(TraitedSpec): subject_info = OutputMultiPath(Bunch, mandatory=True, desc=("Bunch or List(Bunch) subject specific condition information. 
" "see :ref:`SpecifyModel` or SpecifyModel.__doc__ for details")) @@ -328,10 +336,10 @@ def run(self): pass -class TransformEvents(BaseInterface): +class SpecifyEvents(BaseInterface): - input_spec = TransformEventsInputSpec - output_spec = TransformEventsOutputSpec + input_spec = SpecifyEventsInputSpec + output_spec = SpecifyEventsOutputSpec def _get_event_data(self): if isdefined(self.inputs.subject_info): From 095154afa296d98db3e155311a4e0551e75671c1 Mon Sep 17 00:00:00 2001 From: tyarkoni Date: Mon, 1 Aug 2016 14:24:50 -0700 Subject: [PATCH 06/10] working data transformations --- nipype/algorithms/events.py | 91 ++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 36 deletions(-) diff --git a/nipype/algorithms/events.py b/nipype/algorithms/events.py index 13f1c8af64..98394ac29b 100644 --- a/nipype/algorithms/events.py +++ b/nipype/algorithms/events.py @@ -14,6 +14,7 @@ import re from glob import glob from os.path import basename +import json iflogger = logging.getLogger('interface') # Eventually, move the following inside the interface, or wrap in an import check @@ -25,38 +26,38 @@ def alias(target, append=False): def decorator(func): - def wrapper(self, cols, groupby=None, names=None, *args, **kwargs): + def wrapper(self, cols, groupby=None, output=None, *args, **kwargs): cols = self._select_cols(cols) # groupby can be either a single column, in which case it's # interpreted as a categorical variable to groupby directly, or a - # list of column names, in which case it's interpreted as a set of + # list of column output, in which case it's interpreted as a set of # dummy columns to reconstruct a categorical from. if groupby is not None: groupby = self._select_cols(groupby) if len(groupby) > 1: group_results = [] - names = ['%s_%s' % (cn, gn)for cn in cols for gn in groupby] + output = ['%s_%s' % (cn, gn)for cn in cols for gn in groupby] for i, col in enumerate(groupby): _result = np.zeros((len(self.data), len(cols))) - inds = self.data[col].nonzero() - _result[inds] = target(self.data[cols].iloc[inds], *args, **kwargs) + _result[inds] = target(self, cols, *args, **kwargs) group_results.extend(_result.T) result = np.c_[group_results].squeeze().T - result = pd.DataFrame(result, columns=names) + result = pd.DataFrame(result, columns=output) else: - result = self.data.groupby(groupby).apply(target, *args, **kwargs) + result = self.data.groupby(groupby).apply(target, self, cols, *args, **kwargs) else: - result = target(self.data[cols], *args, **kwargs) + result = target(self, cols, *args, **kwargs) if append: - names = result.columns + output = result.columns - if names is not None: - cols = names - - self.data[cols] = result + if output is not None: + result = pd.DataFrame(result, columns=output) + self.data = self.data.join(result) + else: + self.data[cols] = result return wrapper return decorator @@ -94,7 +95,13 @@ def factor(): pass def _standardize(self, cols, demean=True, rescale=True, copy=True): cols = self._select_cols(cols) X = self.data[cols] - self.data[cols] = (X - X.mean()) / np.std(X, 0) + return (X - X.mean()) / np.std(X, 0) + + def select(self, cols): + # Always retain onsets + if 'onset' not in cols: + cols.insert(0, 'onset') + self.data = self.data[self._select_cols(cols)] @alias(_standardize) def standardize(): pass @@ -105,21 +112,24 @@ def _binarize(self, cols, threshold=0.0): above = X > threshold X[above] = 1 X[~above] = 0 - self.data[cols] = X + return X @alias(_binarize) def binarize(): pass - def orthogonalize(self, y_cols, X_cols): - ''' 
Orthogonalize each of the variables in y_cols with respect to all + def _orthogonalize(self, cols, X_cols): + ''' Orthogonalize each of the variables in cols with respect to all of the variables in x_cols. ''' - y_cols, X_cols = self._select_cols(y_cols), self._select_cols(X_cols) + cols, X_cols = self._select_cols(cols), self._select_cols(X_cols) X = self.data[X_cols].values - y = self.data[y_cols].values + y = self.data[cols].values _aX = np.c_[np.ones(len(y)), X] coefs, resids, rank, s = np.linalg.lstsq(_aX, y) - self.data[y_cols] = y - X.dot(coefs[1:]) + return y - X.dot(coefs[1:]) + + @alias(_orthogonalize) + def orthogonalize(): pass def formula(self, f, target=None, replace=False, *args, **kwargs): from patsy import dmatrix @@ -132,17 +142,19 @@ def formula(self, f, target=None, replace=False, *args, **kwargs): raise ValueError("Either a target column must be passed or replace" " must be True.") - def multiply(self, y_cols, x_cols): + def multiply(self, cols, x_cols): x_cols = self._select_cols(x_cols) - result = self.data[x_cols].apply(lambda x: np.multiply(x, self.data[y_cols])) - names = ['%s_%s' % (y_cols, x) for x in x_cols] - self.data[names] = result + result = self.data[x_cols].apply(lambda x: np.multiply(x, self.data[cols])) + output = ['%s_%s' % (cols, x) for x in x_cols] + self.data[output] = result def query(self, q, *args, **kwargs): self.data = self.data.query(filter) def apply(self, func, *args, **kwargs): - self.data = func(self.data, *args, **kwargs) + if isinstance(func, string_types): + func = getattr(self, func) + func(*args, **kwargs) def _to_dense(self): """ Convert the sparse [onset, duration, amplitude] representation @@ -191,9 +203,9 @@ def __init__(self, columns=None, header='infer', sep=None, run_pattern=None): ''' Args: - columns (list): Optional list of column names to use. If passed, + columns (list): Optional list of column output to use. If passed, number of elements must match number of columns in the text - files to be read. If omitted, column names are inferred by + files to be read. If omitted, column output are inferred by pandas (depending on value of header). header (str): passed to pandas; see pd.read_table docs for details. sep (str): column separator; see pd.read_table docs for details. @@ -202,13 +214,13 @@ def __init__(self, columns=None, header='infer', sep=None, default_amplitude (float): Optional default amplitude to set for all events. Will be ignored if an amplitude column is found. condition_pattern (str): regex with which to capture condition - names from input text file filenames. Only the first captured + output from input text file fileoutput. Only the first captured group will be used. subject_pattern (str): regex with which to capture subject - names from input text file filenames. Only the first captured + output from input text file fileoutput. Only the first captured group will be used. - run_pattern (str): regex with which to capture run names from input - text file filenames. Only the first captured group will be used. + run_pattern (str): regex with which to capture run output from input + text file fileoutput. Only the first captured group will be used. 
''' self.columns = columns @@ -300,15 +312,15 @@ class SpecifyEventsInputSpec(BaseInterfaceInputSpec): event_files = InputMultiPath(traits.List(File(exists=True)), mandatory=True, xor=['subject_info', 'event_files'], desc=('list of event description files in 1, 2, 3, or 4 column format ' - 'corresponding to onsets, durations, amplitudes, and names')) + 'corresponding to onsets, durations, amplitudes, and output')) input_units = traits.Enum('secs', 'scans', mandatory=True, desc=("Units of event onsets and durations (secs or scans). Output " "units are always in secs")) time_repetition = traits.Float(mandatory=True, desc=("Time between the start of one volume to the start of " "the next image volume.")) - # transformations = InputMultiPath(traits.List(File(exists=True)), mandatory=True, - # desc=("JSON specification of the transformations to perform.")) + transformations = traits.File(exists=True, mandatory=False, + desc=("JSON specification of the transformations to perform.")) class SpecifyEventsOutputSpec(TraitedSpec): @@ -353,9 +365,16 @@ def _get_event_data(self): def _transform_events(self): events = self._get_event_data() self.transformer = EventTransformer(events) + if isdefined(self.inputs.transformations): + tf = json.load(open(self.inputs.transformations)) + for t in tf['steps']: + name = t.pop('name') + cols = t.pop('input', None) + # args = t.get('args', {}) + self.transformer.apply(name, cols, **t) + if 'select' in tf: + self.transformer.select(tf['select']) - # Transformation application logic goes here later - # ... self.transformer.resample(self.inputs.time_repetition) def _run_interface(self, runtime): From bb4f6f40a4345770683451d0524f91a34275d58e Mon Sep 17 00:00:00 2001 From: tyarkoni Date: Tue, 2 Aug 2016 09:03:12 -0700 Subject: [PATCH 07/10] moved import exception raising into _run_interface --- nipype/algorithms/events.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/nipype/algorithms/events.py b/nipype/algorithms/events.py index 98394ac29b..62d0d0d8af 100644 --- a/nipype/algorithms/events.py +++ b/nipype/algorithms/events.py @@ -1,27 +1,23 @@ from __future__ import division -from builtins import range -from copy import deepcopy -import os - -from nibabel import load import numpy as np from nipype.external.six import string_types from nipype.interfaces.base import (BaseInterface, TraitedSpec, InputMultiPath, traits, File, Bunch, BaseInterfaceInputSpec, isdefined, OutputMultiPath) -from nipype import config, logging +from nipype import logging import re from glob import glob from os.path import basename import json iflogger = logging.getLogger('interface') -# Eventually, move the following inside the interface, or wrap in an import check + +have_pandas = True try: import pandas as pd -except ImportError: - raise ImportError("The events module requires pandas.") +except: + have_nipy = False def alias(target, append=False): @@ -370,7 +366,6 @@ def _transform_events(self): for t in tf['steps']: name = t.pop('name') cols = t.pop('input', None) - # args = t.get('args', {}) self.transformer.apply(name, cols, **t) if 'select' in tf: self.transformer.select(tf['select']) @@ -378,6 +373,9 @@ def _transform_events(self): self.transformer.resample(self.inputs.time_repetition) def _run_interface(self, runtime): + if not have_pandas: + raise ImportError("The SpecifyEvents interface requires pandas. 
" + "Please make sure that pandas is installed.") self._transform_events() return runtime From 2878f47adb2f94c87191e8266e28d3c03d0c6f1c Mon Sep 17 00:00:00 2001 From: tyarkoni Date: Tue, 2 Aug 2016 09:03:28 -0700 Subject: [PATCH 08/10] added column renaming --- nipype/algorithms/events.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nipype/algorithms/events.py b/nipype/algorithms/events.py index 62d0d0d8af..ab9399a1b8 100644 --- a/nipype/algorithms/events.py +++ b/nipype/algorithms/events.py @@ -144,6 +144,10 @@ def multiply(self, cols, x_cols): output = ['%s_%s' % (cols, x) for x in x_cols] self.data[output] = result + def rename(self, cols, output): + rename = dict(zip(cols, output)) + self.data = self.data.rename(columns=rename) + def query(self, q, *args, **kwargs): self.data = self.data.query(filter) From c17af770c83cba26e5355f2a88837a8ee2d8ce73 Mon Sep 17 00:00:00 2001 From: tyarkoni Date: Tue, 2 Aug 2016 09:21:56 -0700 Subject: [PATCH 09/10] removed Transformation class --- nipype/algorithms/events.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/nipype/algorithms/events.py b/nipype/algorithms/events.py index ab9399a1b8..7022ce900b 100644 --- a/nipype/algorithms/events.py +++ b/nipype/algorithms/events.py @@ -329,25 +329,6 @@ class SpecifyEventsOutputSpec(TraitedSpec): "see :ref:`SpecifyModel` or SpecifyModel.__doc__ for details")) -class Transformation(object): - - def __init__(self, transformer, inputs, steps, outputs=None): - - self.transformer = transformer - self.inputs = inputs - self.steps = steps - self.outputs = outputs - self._validate() - - def _validate(self): - missing = set(transform['input'] - set(self.transformer.data.columns)) - if missing: - raise ValueError("Invalid column(s): %s" % missing) - - def run(self): - pass - - class SpecifyEvents(BaseInterface): input_spec = SpecifyEventsInputSpec From db642fa0461d174879dafe665e471471ead43332 Mon Sep 17 00:00:00 2001 From: tyarkoni Date: Tue, 2 Aug 2016 10:32:52 -0700 Subject: [PATCH 10/10] better approach to transformation mapping --- nipype/algorithms/events.py | 97 ++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 45 deletions(-) diff --git a/nipype/algorithms/events.py b/nipype/algorithms/events.py index 7022ce900b..180fc3e92f 100644 --- a/nipype/algorithms/events.py +++ b/nipype/algorithms/events.py @@ -20,11 +20,44 @@ have_nipy = False +class Transformations(object): + + @staticmethod + def standardize(data, demean=True, rescale=True): + if demean: + data -= data.mean(0) + if rescale: + data /= data.std(0) + return data + + @staticmethod + def orthogonalize(data, other): + ''' Orthogonalize each of the variables in cols with respect to all + of the variables in x_cols. 
+ ''' + y = data.values + X = other.values + _aX = np.c_[np.ones(len(y)), X] + coefs, resids, rank, s = np.linalg.lstsq(_aX, y) + return y - X.dot(coefs[1:]) + + @staticmethod + def binarize(data, threshold=0.0): + above = data > threshold + data[above] = 1 + data[~above] = 0 + return data + + def alias(target, append=False): def decorator(func): def wrapper(self, cols, groupby=None, output=None, *args, **kwargs): cols = self._select_cols(cols) + data = self.data[cols] + + if 'other' in kwargs: + kwargs['other'] = self.data[kwargs['other']] # groupby can be either a single column, in which case it's # interpreted as a categorical variable to groupby directly, or a @@ -36,15 +69,14 @@ def wrapper(self, cols, groupby=None, output=None, *args, **kwargs): group_results = [] output = ['%s_%s' % (cn, gn)for cn in cols for gn in groupby] for i, col in enumerate(groupby): - _result = np.zeros((len(self.data), len(cols))) - _result[inds] = target(self, cols, *args, **kwargs) + _result = target(data, *args, **kwargs) group_results.extend(_result.T) result = np.c_[group_results].squeeze().T result = pd.DataFrame(result, columns=output) else: - result = self.data.groupby(groupby).apply(target, self, cols, *args, **kwargs) + result = self.data.groupby(groupby).apply(target, data, *args, **kwargs) else: - result = target(self, cols, *args, **kwargs) + result = target(data, *args, **kwargs) if append: output = result.columns @@ -66,13 +98,7 @@ def __init__(self, events, orig_hz=1, target_hz=1000): self.target_hz = target_hz self._to_dense() - def _select_cols(self, cols): - if isinstance(cols, string_types) and '*' in cols: - # if '*' in cols: - patt = re.compile(cols.replace('*', '.*')) - cols = [l for l in self.data.columns.tolist() for m in [patt.search(l)] if m] - return cols - + ### Aliased functions ### @alias(np.log) def log(): pass @@ -88,45 +114,22 @@ def not_(): pass @alias(pd.get_dummies, append=True) def factor(): pass - def _standardize(self, cols, demean=True, rescale=True, copy=True): - cols = self._select_cols(cols) - X = self.data[cols] - return (X - X.mean()) / np.std(X, 0) + @alias(Transformations.standardize) + def standardize(): pass + + @alias(Transformations.binarize) + def binarize(): pass + + @alias(Transformations.orthogonalize) + def orthogonalize(): pass + ### Standard instance methods ### def select(self, cols): # Always retain onsets if 'onset' not in cols: cols.insert(0, 'onset') self.data = self.data[self._select_cols(cols)] - @alias(_standardize) - def standardize(): pass - - def _binarize(self, cols, threshold=0.0): - cols = self._select_cols(cols) - X = self.data[cols].values - above = X > threshold - X[above] = 1 - X[~above] = 0 - return X - - @alias(_binarize) - def binarize(): pass - - def _orthogonalize(self, cols, X_cols): - ''' Orthogonalize each of the variables in cols with respect to all - of the variables in x_cols. 
- ''' - cols, X_cols = self._select_cols(cols), self._select_cols(X_cols) - X = self.data[X_cols].values - y = self.data[cols].values - _aX = np.c_[np.ones(len(y)), X] - coefs, resids, rank, s = np.linalg.lstsq(_aX, y) - return y - X.dot(coefs[1:]) - - @alias(_orthogonalize) - def orthogonalize(): pass - def formula(self, f, target=None, replace=False, *args, **kwargs): from patsy import dmatrix result = dmatrix(f, self.data, return_type='dataframe', *args, **kwargs) @@ -156,6 +159,12 @@ def apply(self, func, *args, **kwargs): func = getattr(self, func) func(*args, **kwargs) + def _select_cols(self, cols): + if isinstance(cols, string_types) and '*' in cols: + patt = re.compile(cols.replace('*', '.*')) + cols = [l for l in self.data.columns.tolist() for m in [patt.search(l)] if m] + return cols + def _to_dense(self): """ Convert the sparse [onset, duration, amplitude] representation typical of event files to a dense matrix where each row represents @@ -352,8 +361,6 @@ def _transform_events(self): name = t.pop('name') cols = t.pop('input', None) self.transformer.apply(name, cols, **t) - if 'select' in tf: - self.transformer.select(tf['select']) self.transformer.resample(self.inputs.time_repetition)