From 2e61f9cab45bfa05142d91e81ed0133bc0324310 Mon Sep 17 00:00:00 2001
From: rockg
Date: Wed, 20 Jan 2016 07:16:40 -0500
Subject: [PATCH] PEP: pandas/core round 7 (window, reshape, series, format,
 minor categorical)

---
 pandas/core/categorical.py |  10 +-
 pandas/core/format.py      | 477 ++++++++++++++++++-------------
 pandas/core/reshape.py     |  95 ++++----
 pandas/core/series.py      | 306 ++++++++++++------------
 pandas/core/window.py      | 333 +++++++++++++++-----------
 5 files changed, 639 insertions(+), 582 deletions(-)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index abc9e58d7c435..8a6ea69058c7e 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -14,11 +14,11 @@
 
 from pandas.util.decorators import cache_readonly, deprecate_kwarg
 from pandas.core.common import (
-    ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex, isnull,
-    notnull, is_dtype_equal, is_categorical_dtype, is_integer_dtype,
-    is_object_dtype, _possibly_infer_to_datetimelike, get_dtype_kinds,
-    is_list_like, is_sequence, is_null_slice, is_bool, _ensure_platform_int,
-    _ensure_object, _ensure_int64, _coerce_indexer_dtype, take_1d)
+    ABCSeries, ABCIndexClass, ABCCategoricalIndex, isnull, notnull,
+    is_dtype_equal, is_categorical_dtype, is_integer_dtype,
+    _possibly_infer_to_datetimelike, get_dtype_kinds, is_list_like,
+    is_sequence, is_null_slice, is_bool, _ensure_object, _ensure_int64,
+    _coerce_indexer_dtype, take_1d)
 from pandas.core.dtypes import CategoricalDtype
 from pandas.util.terminal import get_terminal_size
 from pandas.core.config import get_option
diff --git a/pandas/core/format.py b/pandas/core/format.py
index a50edd9462431..10b67d6229234 100644
--- a/pandas/core/format.py
+++ b/pandas/core/format.py
@@ -6,11 +6,11 @@
 import sys
 
 from pandas.core.base import PandasObject
-from pandas.core.common import adjoin, isnull, notnull
+from pandas.core.common import isnull, notnull
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas import compat
-from pandas.compat import(StringIO, lzip, range, map, zip, reduce, u,
-                          OrderedDict)
+from pandas.compat import (StringIO, lzip, range, map, zip, reduce, u,
+                           OrderedDict)
 from pandas.util.terminal import get_terminal_size
 from pandas.core.config import get_option, set_option
 from pandas.io.common import _get_handle, UnicodeWriter, _expand_user
@@ -54,7 +54,7 @@
     index_names : bool, optional
         Prints the names of the indexes, default True"""

-justify_docstring = """ 
+justify_docstring = """
     justify : {'left', 'right'}, default None
         Left or right-justify the column labels. If None uses the option from
         the print configuration (controlled by set_option), 'right' out
@@ -68,10 +68,10 @@
 docstring_to_string = common_docstring + justify_docstring + return_docstring
 
 
-class CategoricalFormatter(object):
-
-    def __init__(self, categorical, buf=None, length=True,
-                 na_rep='NaN', footer=True):
+class CategoricalFormatter(object):
+    def __init__(self, categorical, buf=None, length=True, na_rep='NaN',
+                 footer=True):
         self.categorical = categorical
         self.buf = buf if buf is not None else StringIO(u(""))
         self.na_rep = na_rep
@@ -97,8 +97,7 @@ def _get_footer(self):
 
     def _get_formatted_values(self):
         return format_array(self.categorical.get_values(), None,
-                            float_format=None,
-                            na_rep=self.na_rep)
+                            float_format=None, na_rep=self.na_rep)
 
     def to_string(self):
         categorical = self.categorical
@@ -114,7 +113,7 @@ def to_string(self):
         result = ['%s' % i for i in fmt_values]
         result = [i.strip() for i in result]
         result = u(', ').join(result)
-        result = [u('[')+result+u(']')]
+        result = [u('[') + result + u(']')]
         if self.footer:
             footer = self._get_footer()
             if footer:
@@ -124,10 +123,9 @@ def to_string(self):
 
 
 class SeriesFormatter(object):
-
-    def __init__(self, series, buf=None, length=True, header=True,
-                 index=True, na_rep='NaN', name=False, float_format=None,
-                 dtype=True, max_rows=None):
+    def __init__(self, series, buf=None, length=True, header=True, index=True,
+                 na_rep='NaN', name=False, float_format=None, dtype=True,
+                 max_rows=None):
         self.series = series
         self.buf = buf if buf is not None else StringIO()
         self.name = name
@@ -156,7 +154,8 @@ def _chk_truncate(self):
                 series = series.iloc[:max_rows]
             else:
                 row_num = max_rows // 2
-                series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
+                series = concat((series.iloc[:row_num],
+                                 series.iloc[-row_num:]))
             self.tr_row_num = row_num
         self.tr_series = series
         self.truncate_v = truncate_v
@@ -174,8 +173,7 @@ def _get_footer(self):
 
             series_name = com.pprint_thing(name,
                                            escape_chars=('\t', '\r', '\n'))
-            footer += ("Name: %s" %
-                       series_name) if name is not None else ""
+            footer += ("Name: %s" % series_name) if name is not None else ""
 
         if self.length:
             if footer:
@@ -189,7 +187,8 @@ def _get_footer(self):
                 footer += ', '
             footer += 'dtype: %s' % com.pprint_thing(name)
 
-        # level infos are added to the end and in a new line, like it is done for Categoricals
+        # level infos are added to the end and in a new line, like it is done
+        # for Categoricals
         if com.is_categorical_dtype(self.tr_series.dtype):
             level_info = self.tr_series._values._repr_categories_info()
             if footer:
@@ -212,8 +211,7 @@ def _get_formatted_index(self):
 
     def _get_formatted_values(self):
         return format_array(self.tr_series._values, None,
-                            float_format=self.float_format,
-                            na_rep=self.na_rep)
+                            float_format=self.float_format, na_rep=self.na_rep)
 
     def to_string(self):
         series = self.tr_series
@@ -225,13 +223,10 @@ def to_string(self):
         fmt_index, have_header = self._get_formatted_index()
         fmt_values = self._get_formatted_values()
 
-        maxlen = max(self.adj.len(x) for x in fmt_index)  # max index len
-        pad_space = min(maxlen, 60)
-
         if self.truncate_v:
             n_header_rows = 0
             row_num = self.tr_row_num
-            width = self.adj.len(fmt_values[row_num-1])
+            width = self.adj.len(fmt_values[row_num - 1])
             if width > 3:
                 dot_str = '...'
             else:
@@ -258,7 +253,6 @@ def to_string(self):
 
 
 class TextAdjustment(object):
-
     def __init__(self):
         self.encoding = get_option("display.encoding")
 
@@ -274,7 +268,6 @@ def adjoin(self, space, *lists, **kwargs):
 
 
 class EastAsianTextAdjustment(TextAdjustment):
-
     def __init__(self):
         super(EastAsianTextAdjustment, self).__init__()
         if get_option("display.unicode.ambiguous_as_wide"):
@@ -313,8 +306,8 @@ class TableFormatter(object):
 
     @property
     def should_show_dimensions(self):
-        return self.show_dimensions is True or (self.show_dimensions == 'truncate' and
-                                                self.is_truncated)
+        return (self.show_dimensions is True or
+                (self.show_dimensions == 'truncate' and self.is_truncated))
 
     def _get_formatter(self, i):
         if isinstance(self.formatters, (list, tuple)):
@@ -329,7 +322,6 @@ def _get_formatter(self, i):
 
 
 class DataFrameFormatter(TableFormatter):
-
     """
     Render a DataFrame
 
@@ -386,10 +378,10 @@ def __init__(self, frame, buf=None, columns=None, col_space=None,
         self.adj = _get_adjustment()
 
     def _chk_truncate(self):
-        '''
+        """
         Checks whether the frame should be truncated. If so, slices
         the frame up.
-        '''
+        """
         from pandas.tools.merge import concat
 
        # Column of which first element is used to determine width of a dot col
@@ -399,7 +391,8 @@ def _chk_truncate(self):
         max_cols = self.max_cols
         max_rows = self.max_rows
 
-        if max_cols == 0 or max_rows == 0:  # assume we are in the terminal (why else = 0)
+        if max_cols == 0 or max_rows == 0:  # assume we are in the terminal
+                                            # (why else = 0)
             (w, h) = get_terminal_size()
             self.w = w
             self.h = h
@@ -408,11 +401,14 @@ def _chk_truncate(self):
             prompt_row = 1
             if self.show_dimensions:
                 show_dimension_rows = 3
-            n_add_rows = self.header + dot_row + show_dimension_rows + prompt_row
-            max_rows_adj = self.h - n_add_rows  # rows available to fill with actual data
+            n_add_rows = (self.header + dot_row + show_dimension_rows +
+                          prompt_row)
+            # rows available to fill with actual data
+            max_rows_adj = self.h - n_add_rows
             self.max_rows_adj = max_rows_adj
 
-            # Format only rows and columns that could potentially fit the screen
+            # Format only rows and columns that could potentially fit the
+            # screen
             if max_cols == 0 and len(self.frame.columns) > w:
                 max_cols = w
             if max_rows == 0 and len(self.frame) > h:
@@ -438,7 +434,8 @@ def _chk_truncate(self):
                 col_num = max_cols
             else:
                 col_num = (max_cols_adj // 2)
-                frame = concat((frame.iloc[:, :col_num], frame.iloc[:, -col_num:]), axis=1)
+                frame = concat((frame.iloc[:, :col_num],
+                                frame.iloc[:, -col_num:]), axis=1)
             self.tr_col_num = col_num
         if truncate_v:
             if max_rows_adj == 0:
@@ -448,7 +445,8 @@ def _chk_truncate(self):
                 frame = frame.iloc[:max_rows, :]
             else:
                 row_num = max_rows_adj // 2
-                frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :]))
+                frame = concat((frame.iloc[:row_num, :],
+                                frame.iloc[-row_num:, :]))
             self.tr_row_num = row_num
 
         self.tr_frame = frame
@@ -471,8 +469,8 @@ def _to_str_columns(self):
             stringified = []
             for i, c in enumerate(frame):
                 cheader = str_columns[i]
-                max_colwidth = max(self.col_space or 0,
-                                   *(self.adj.len(x) for x in cheader))
+                max_colwidth = max(self.col_space or 0, *(self.adj.len(x)
+                                                          for x in cheader))
                 fmt_values = self._format_col(i)
                 fmt_values = _make_fixed_width(fmt_values, self.justify,
                                                minimum=max_colwidth,
@@ -502,13 +500,16 @@ def _to_str_columns(self):
 
         if truncate_h:
             col_num = self.tr_col_num
-            col_width = self.adj.len(strcols[self.tr_size_col][0])  # infer from column header
-            strcols.insert(self.tr_col_num + 1, ['...'.center(col_width)] * (len(str_index)))
+            # infer from column header
+            col_width = self.adj.len(strcols[self.tr_size_col][0])
+            strcols.insert(self.tr_col_num + 1, ['...'.center(col_width)] *
+                           (len(str_index)))
         if truncate_v:
             n_header_rows = len(str_index) - len(frame)
             row_num = self.tr_row_num
             for ix, col in enumerate(strcols):
-                cwidth = self.adj.len(strcols[ix][row_num])  # infer from above row
+                # infer from above row
+                cwidth = self.adj.len(strcols[ix][row_num])
                 is_dot_col = False
                 if truncate_h:
                     is_dot_col = ix == col_num + 1
@@ -537,16 +538,18 @@ def to_string(self):
         frame = self.frame
 
         if len(frame.columns) == 0 or len(frame.index) == 0:
-            info_line = (u('Empty %s\nColumns: %s\nIndex: %s')
-                         % (type(self.frame).__name__,
-                            com.pprint_thing(frame.columns),
-                            com.pprint_thing(frame.index)))
+            info_line = (u('Empty %s\nColumns: %s\nIndex: %s') %
+                         (type(self.frame).__name__,
+                          com.pprint_thing(frame.columns),
+                          com.pprint_thing(frame.index)))
             text = info_line
         else:
             strcols = self._to_str_columns()
-            if self.line_width is None:  # no need to wrap around just print the whole frame
+            if self.line_width is None:  # no need to wrap around just print
+                                         # the whole frame
                 text = self.adj.adjoin(1, *strcols)
-            elif not isinstance(self.max_cols, int) or self.max_cols > 0:  # need to wrap around
+            elif (not isinstance(self.max_cols, int) or
+                  self.max_cols > 0):  # need to wrap around
                 text = self._join_multiline(*strcols)
             else:  # max_cols == 0. Try to fit frame to terminal
                 text = self.adj.adjoin(1, *strcols).split('\n')
@@ -554,12 +557,15 @@ def to_string(self):
                 max_len_col_ix = np.argmax(row_lens)
                 max_len = row_lens[max_len_col_ix]
                 headers = [ele[0] for ele in strcols]
-                # Size of last col determines dot col size. See `self._to_str_columns
+                # Size of last col determines dot col size. See
+                # `self._to_str_columns
                 size_tr_col = len(headers[self.tr_size_col])
-                max_len += size_tr_col  # Need to make space for largest row plus truncate dot col
+                max_len += size_tr_col  # Need to make space for largest row
+                                        # plus truncate dot col
                 dif = max_len - self.w
                 adj_dif = dif
-                col_lens = Series([Series(ele).apply(len).max() for ele in strcols])
+                col_lens = Series([Series(ele).apply(len).max()
+                                   for ele in strcols])
                 n_cols = len(col_lens)
                 counter = 0
                 while adj_dif > 0 and n_cols > 1:
@@ -583,8 +589,8 @@ def to_string(self):
         self.buf.writelines(text)
 
         if self.should_show_dimensions:
-            self.buf.write("\n\n[%d rows x %d columns]"
-                           % (len(frame), len(frame.columns)))
+            self.buf.write("\n\n[%d rows x %d columns]" %
+                           (len(frame), len(frame.columns)))
 
     def _join_multiline(self, *strcols):
         lwidth = self.line_width
@@ -592,11 +598,11 @@ def _join_multiline(self, *strcols):
         strcols = list(strcols)
         if self.index:
             idx = strcols.pop(0)
-            lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width
+            lwidth -= np.array([self.adj.len(x)
+                                for x in idx]).max() + adjoin_width
 
-        col_widths = [np.array([self.adj.len(x) for x in col]).max()
-                      if len(col) > 0 else 0
-                      for col in strcols]
+        col_widths = [np.array([self.adj.len(x) for x in col]).max() if
+                      len(col) > 0 else 0 for col in strcols]
         col_bins = _binify(col_widths, lwidth)
         nbins = len(col_bins)
@@ -640,11 +646,9 @@ def to_latex(self, column_format=None, longtable=False, encoding=None):
     def _format_col(self, i):
         frame = self.tr_frame
         formatter = self._get_formatter(i)
-        return format_array(
-            frame.iloc[:, i]._values,
-            formatter, float_format=self.float_format, na_rep=self.na_rep,
-            space=self.col_space
-        )
+        return format_array(frame.iloc[:, i]._values, formatter,
+                            float_format=self.float_format, na_rep=self.na_rep,
+                            space=self.col_space)
 
     def to_html(self, classes=None, notebook=False):
         """
@@ -687,11 +691,13 @@ def is_numeric_dtype(dtype):
             need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
 
             def space_format(x, y):
-                if y not in self.formatters and need_leadsp[x] and not restrict_formatting:
+                if (y not in self.formatters and
+                        need_leadsp[x] and not restrict_formatting):
                     return ' ' + y
                 return y
 
-            str_columns = list(zip(*[[space_format(x, y) for y in x] for x in fmt_columns]))
+            str_columns = list(zip(*[[space_format(x, y) for y in x]
+                                     for x in fmt_columns]))
             if self.sparsify:
                 str_columns = _sparsify(str_columns)
 
@@ -700,11 +706,10 @@ def space_format(x, y):
             fmt_columns = columns.format()
             dtypes = self.frame.dtypes
             need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
-            str_columns = [[' ' + x
-                            if not self._get_formatter(i) and need_leadsp[x]
-                            else x]
-                           for i, (col, x) in
-                           enumerate(zip(columns, fmt_columns))]
+            str_columns = [[' ' + x if not self._get_formatter(i) and
+                            need_leadsp[x] else x]
+                           for i, (col, x) in enumerate(zip(columns,
+                                                            fmt_columns))]
 
         if self.show_index_names and self.has_index_names:
             for x in str_columns:
@@ -722,7 +727,8 @@ def has_column_names(self):
         return _has_names(self.frame.columns)
 
     def _get_formatted_index(self, frame):
-        # Note: this is only used by to_string() and to_latex(), not by to_html().
+        # Note: this is only used by to_string() and to_latex(), not by
+        # to_html().
         index = frame.index
         columns = frame.columns
 
@@ -733,14 +739,12 @@ def _get_formatted_index(self, frame):
 
         if isinstance(index, MultiIndex):
             fmt_index = index.format(sparsify=self.sparsify, adjoin=False,
-                                     names=show_index_names,
-                                     formatter=fmt)
+                                     names=show_index_names, formatter=fmt)
         else:
             fmt_index = [index.format(name=show_index_names, formatter=fmt)]
 
         fmt_index = [tuple(_make_fixed_width(list(x), justify='left',
                            minimum=(self.col_space or 0),
-                           adj=self.adj))
-                     for x in fmt_index]
+                           adj=self.adj)) for x in fmt_index]
 
         adjoined = self.adj.adjoin(1, *fmt_index).split('\n')
 
@@ -797,9 +801,9 @@ def write_result(self, buf):
         # string representation of the columns
         if len(self.frame.columns) == 0 or len(self.frame.index) == 0:
-            info_line = (u('Empty %s\nColumns: %s\nIndex: %s')
-                         % (type(self.frame).__name__,
-                            self.frame.columns, self.frame.index))
+            info_line = (u('Empty %s\nColumns: %s\nIndex: %s') %
+                         (type(self.frame).__name__, self.frame.columns,
+                          self.frame.index))
             strcols = [[info_line]]
         else:
             strcols = self.fmt._to_str_columns()
@@ -862,16 +866,12 @@ def get_col_type(dtype):
                 buf.write('\\endlastfoot\n')
             if self.fmt.kwds.get('escape', True):
                 # escape backslashes first
-                crow = [(x.replace('\\', '\\textbackslash')
-                         .replace('_', '\\_')
-                         .replace('%', '\\%')
-                         .replace('$', '\\$')
-                         .replace('#', '\\#')
-                         .replace('{', '\\{')
-                         .replace('}', '\\}')
-                         .replace('~', '\\textasciitilde')
-                         .replace('^', '\\textasciicircum')
-                         .replace('&', '\\&') if x else '{}') for x in row]
+                crow = [(x.replace('\\', '\\textbackslash').replace('_', '\\_')
+                         .replace('%', '\\%').replace('$', '\\$')
+                         .replace('#', '\\#').replace('{', '\\{')
+                         .replace('}', '\\}').replace('~', '\\textasciitilde')
+                         .replace('^', '\\textasciicircum').replace('&', '\\&')
+                         if x else '{}') for x in row]
             else:
                 crow = [x if x else '{}' for x in row]
             buf.write(' & '.join(crow))
@@ -911,8 +911,7 @@ def write(self, s, indent=0):
         self.elements.append(' ' * indent + rs)
 
     def write_th(self, s, indent=0, tags=None):
-        if (self.fmt.col_space is not None
-                and self.fmt.col_space > 0):
+        if self.fmt.col_space is not None and self.fmt.col_space > 0:
             tags = (tags or "")
             tags += 'style="min-width: %s;"' % self.fmt.col_space
 
@@ -929,14 +928,12 @@ def _write_cell(self, s, kind='td', indent=0, tags=None):
 
         if self.escape:
             # escape & first to prevent double escaping of &
-            esc = OrderedDict(
-                [('&', r'&amp;'), ('<', r'&lt;'), ('>', r'&gt;')]
-            )
+            esc = OrderedDict([('&', r'&amp;'), ('<', r'&lt;'),
+                               ('>', r'&gt;')])
         else:
             esc = {}
         rs = com.pprint_thing(s, escape_chars=esc).strip()
-        self.write(
-            '%s%s</%s>' % (start_tag, rs, kind), indent)
+        self.write('%s%s</%s>' % (start_tag, rs, kind), indent)
 
     def write_tr(self, line, indent=0, indent_delta=4, header=False,
                  align=None, tags=None, nindex_levels=0):
@@ -968,8 +965,8 @@ def write_result(self, buf):
             if isinstance(self.classes, str):
                 self.classes = self.classes.split()
             if not isinstance(self.classes, (list, tuple)):
-                raise AssertionError(('classes must be list or tuple, '
-                                      'not %s') % type(self.classes))
+                raise AssertionError('classes must be list or tuple, '
+                                     'not %s' % type(self.classes))
             _classes.extend(self.classes)
 
         if self.notebook:
@@ -1020,8 +1017,8 @@ def _column_header():
                 else:
                     row.append('')
                     style = "text-align: %s;" % self.fmt.justify
-                    row.extend([single_column_table(c, self.fmt.justify, style) for
-                                c in self.columns])
+                    row.extend([single_column_table(c, self.fmt.justify, style)
+                                for c in self.columns])
             else:
                 if self.fmt.index:
                     row.append(self.columns.name or '')
@@ -1041,8 +1038,8 @@ def _column_header():
                 sentinel = com.sentinel_factory()
             else:
                 sentinel = None
-            levels = self.columns.format(sparsify=sentinel,
-                                         adjoin=False, names=False)
+            levels = self.columns.format(sparsify=sentinel, adjoin=False,
+                                         names=False)
             level_lengths = _get_level_lengths(levels, sentinel)
             inner_lvl = len(level_lengths) - 1
             for lnum, (records, values) in enumerate(zip(level_lengths,
@@ -1059,18 +1056,21 @@ def _column_header():
                         elif tag + span > ins_col:
                             recs_new[tag] = span + 1
                             if lnum == inner_lvl:
-                                values = values[:ins_col] + (u('...'),) + \
-                                    values[ins_col:]
-                            else:  # sparse col headers do not receive a ...
-                                values = (values[:ins_col] + (values[ins_col - 1],) +
+                                values = (values[:ins_col] + (u('...'),) +
+                                          values[ins_col:])
+                            else:
+                                # sparse col headers do not receive a ...
+                                values = (values[:ins_col] +
+                                          (values[ins_col - 1], ) +
                                           values[ins_col:])
                         else:
                             recs_new[tag] = span
-                        # if ins_col lies between tags, all col headers get ...
+                        # if ins_col lies between tags, all col headers
+                        # get ...
                        if tag + span == ins_col:
                             recs_new[ins_col] = 1
-                            values = values[:ins_col] + (u('...'),) + \
-                                values[ins_col:]
+                            values = (values[:ins_col] + (u('...'),) +
+                                      values[ins_col:])
                     records = recs_new
                     inner_lvl = len(level_lengths) - 1
                     if lnum == inner_lvl:
@@ -1084,11 +1084,12 @@ def _column_header():
                             recs_new[tag] = span
                     recs_new[ins_col] = 1
                     records = recs_new
-                    values = values[:ins_col] + [u('...')] + values[ins_col:]
+                    values = (values[:ins_col] + [u('...')] +
+                              values[ins_col:])
 
             name = self.columns.names[lnum]
-            row = [''] * (row_levels - 1) + ['' if name is None
-                                             else com.pprint_thing(name)]
+            row = [''] * (row_levels - 1) + ['' if name is None else
+                                             com.pprint_thing(name)]
 
             if row == [""] and self.fmt.index is False:
                 row = []
@@ -1117,9 +1118,9 @@ def _column_header():
                           align=align)
 
         if self.fmt.has_index_names and self.fmt.index:
-            row = [
-                x if x is not None else '' for x in self.frame.index.names
-            ] + [''] * min(len(self.columns), self.max_cols)
+            row = ([x if x is not None else ''
+                    for x in self.frame.index.names] +
+                   [''] * min(len(self.columns), self.max_cols))
             if truncate_h:
                 ins_col = row_levels + self.fmt.tr_col_num
                 row.insert(ins_col, '')
@@ -1172,8 +1173,8 @@ def _write_regular_rows(self, fmt_values, indent):
 
             if truncate_v and i == (self.fmt.tr_row_num):
                 str_sep_row = ['...' for ele in row]
-                self.write_tr(str_sep_row, indent, self.indent_delta, tags=None,
-                              nindex_levels=1)
+                self.write_tr(str_sep_row, indent, self.indent_delta,
+                              tags=None, nindex_levels=1)
 
             row = []
             row.append(index_values[i])
@@ -1195,13 +1196,15 @@ def _write_hierarchical_rows(self, fmt_values, indent):
         nrows = len(frame)
         row_levels = self.frame.index.nlevels
 
-        idx_values = frame.index.format(sparsify=False, adjoin=False, names=False)
+        idx_values = frame.index.format(sparsify=False, adjoin=False,
+                                        names=False)
         idx_values = lzip(*idx_values)
 
         if self.fmt.sparsify:
 
             # GH3547
             sentinel = com.sentinel_factory()
-            levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False)
+            levels = frame.index.format(sparsify=sentinel, adjoin=False,
+                                        names=False)
 
             level_lengths = _get_level_lengths(levels, sentinel)
             inner_lvl = len(level_lengths) - 1
@@ -1221,11 +1224,13 @@ def _write_hierarchical_rows(self, fmt_values, indent):
                                 idx_values.insert(ins_row, tuple(dot_row))
                             else:
                                 rec_new[tag] = span
-                            # If ins_row lies between tags, all cols idx cols receive ...
+                            # If ins_row lies between tags, all cols idx cols
+                            # receive ...
                            if tag + span == ins_row:
                                 rec_new[ins_row] = 1
                                 if lnum == 0:
-                                    idx_values.insert(ins_row, tuple([u('...')]*len(level_lengths)))
+                                    idx_values.insert(ins_row, tuple(
+                                        [u('...')] * len(level_lengths)))
 
                         level_lengths[lnum] = rec_new
 
                 level_lengths[inner_lvl][ins_row] = 1
@@ -1252,14 +1257,14 @@ def _write_hierarchical_rows(self, fmt_values, indent):
                 row.extend(fmt_values[j][i] for j in range(ncols))
 
                 if truncate_h:
-                    row.insert(row_levels - sparse_offset + self.fmt.tr_col_num, '...')
+                    row.insert(row_levels - sparse_offset +
+                               self.fmt.tr_col_num, '...')
                 self.write_tr(row, indent, self.indent_delta, tags=tags,
                               nindex_levels=len(levels) - sparse_offset)
         else:
             for i in range(len(frame)):
-                idx_values = list(zip(*frame.index.format(sparsify=False,
-                                                          adjoin=False,
-                                                          names=False)))
+                idx_values = list(zip(*frame.index.format(
+                    sparsify=False, adjoin=False, names=False)))
                 row = []
                 row.extend(idx_values[i])
                 row.extend(fmt_values[j][i] for j in range(ncols))
@@ -1279,6 +1284,7 @@ def grouper(x):
             if x != sentinel:
                 record['count'] += 1
             return record['count']
+
         return grouper
 
     result = []
@@ -1297,18 +1303,18 @@ def grouper(x):
 
 
 class CSVFormatter(object):
-
-    def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
-                 cols=None, header=True, index=True, index_label=None,
-                 mode='w', nanRep=None, encoding=None, compression=None, quoting=None,
-                 line_terminator='\n', chunksize=None, engine=None,
-                 tupleize_cols=False, quotechar='"', date_format=None,
-                 doublequote=True, escapechar=None, decimal='.'):
+    def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
+                 float_format=None, cols=None, header=True, index=True,
+                 index_label=None, mode='w', nanRep=None, encoding=None,
+                 compression=None, quoting=None, line_terminator='\n',
+                 chunksize=None, engine=None, tupleize_cols=False,
+                 quotechar='"', date_format=None, doublequote=True,
+                 escapechar=None, decimal='.'):
 
         if engine is not None:
-            warnings.warn("'engine' keyword is deprecated and "
-                          "will be removed in a future version",
-                          FutureWarning, stacklevel=3)
+            warnings.warn("'engine' keyword is deprecated and will be "
+                          "removed in a future version", FutureWarning,
+                          stacklevel=3)
         self.engine = engine  # remove for 0.18
         self.obj = obj
@@ -1350,8 +1356,8 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
                                  "supported with engine='python'")
 
         self.tupleize_cols = tupleize_cols
-        self.has_mi_columns = isinstance(obj.columns, MultiIndex
-                                         ) and not self.tupleize_cols
+        self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
+                               not self.tupleize_cols)
 
         # validate mi options
         if self.has_mi_columns:
@@ -1398,9 +1404,8 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
         if (isinstance(self.data_index, DatetimeIndex) and
                 date_format is not None):
-            self.data_index = Index([x.strftime(date_format)
-                                     if notnull(x) else ''
-                                     for x in self.data_index])
+            self.data_index = Index([x.strftime(date_format) if notnull(x) else
+                                     '' for x in self.data_index])
 
         self.nlevels = getattr(self.data_index, 'nlevels', 1)
         if not index:
@@ -1408,9 +1413,9 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
 
     # original python implem. of df.to_csv
     # invoked by df.to_csv(engine=python)
-    def _helper_csv(self, writer, na_rep=None, cols=None,
-                    header=True, index=True,
-                    index_label=None, float_format=None, date_format=None):
+    def _helper_csv(self, writer, na_rep=None, cols=None, header=True,
+                    index=True, index_label=None, float_format=None,
+                    date_format=None):
         if cols is None:
             cols = self.columns
@@ -1459,6 +1464,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None,
         if date_format is None:
             date_formatter = lambda x: Timestamp(x)._repr_base
         else:
+
             def strftime_with_nulls(x):
                 x = Timestamp(x)
                 if notnull(x):
@@ -1477,9 +1483,7 @@ def strftime_with_nulls(x):
             values = self.obj.copy()
             values.index = data_index
             values.columns = values.columns.to_native_types(
-                na_rep=na_rep,
-                float_format=float_format,
-                date_format=date_format,
+                na_rep=na_rep, float_format=float_format, date_format=date_format,
                 quoting=self.quoting)
             values = values[cols]
@@ -1516,8 +1520,8 @@ def save(self):
             close = False
         else:
             f = _get_handle(self.path_or_buf, self.mode,
-                             encoding=self.encoding,
-                             compression=self.compression)
+                            encoding=self.encoding,
+                            compression=self.compression)
             close = True
 
         try:
@@ -1586,7 +1590,8 @@ def _save_header(self):
                     index_label = ['']
                 else:
                     index_label = [index_label]
-            elif not isinstance(index_label, (list, tuple, np.ndarray, Index)):
+            elif not isinstance(index_label,
+                                (list, tuple, np.ndarray, Index)):
                 # given a string for a DF with Index
                 index_label = [index_label]
@@ -1652,8 +1657,7 @@ def _save_chunk(self, start_i, end_i):
         slicer = slice(start_i, end_i)
         for i in range(len(self.blocks)):
             b = self.blocks[i]
-            d = b.to_native_types(slicer=slicer,
-                                  na_rep=self.na_rep,
+            d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                   float_format=self.float_format,
                                   decimal=self.decimal,
                                   date_format=self.date_format,
@@ -1663,8 +1667,7 @@ def _save_chunk(self, start_i, end_i):
                 # self.data is a preallocated list
                 self.data[col_loc] = col
 
-        ix = data_index.to_native_types(slicer=slicer,
-                                        na_rep=self.na_rep,
+        ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                         float_format=self.float_format,
                                         decimal=self.decimal,
                                         date_format=self.date_format,
@@ -1681,8 +1684,8 @@ class ExcelCell(object):
     __fields__ = ('row', 'col', 'val', 'style', 'mergestart', 'mergeend')
     __slots__ = __fields__
 
-    def __init__(self, row, col, val,
-                 style=None, mergestart=None, mergeend=None):
+    def __init__(self, row, col, val, style=None, mergestart=None,
+                 mergeend=None):
         self.row = row
         self.col = col
         self.val = val
@@ -1696,11 +1699,11 @@ def __init__(self, row, col, val,
                             "right": "thin",
                             "bottom": "thin",
                             "left": "thin"},
-                "alignment": {"horizontal": "center", "vertical": "top"}}
+                "alignment": {"horizontal": "center",
+                              "vertical": "top"}}
 
 
 class ExcelFormatter(object):
-
     """
     Class for formatting a DataFrame to a list of ExcelCells,
 
@@ -1761,15 +1764,17 @@ def _format_header_mi(self):
         if self.columns.nlevels > 1:
             if not self.index:
                 raise NotImplementedError("Writing to Excel with MultiIndex"
-                                          " columns and no index ('index'=False) "
-                                          "is not yet implemented.")
+                                          " columns and no index "
+                                          "('index'=False) is not yet "
+                                          "implemented.")
 
         has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
-        if not(has_aliases or self.header):
+        if not (has_aliases or self.header):
             return
 
         columns = self.columns
-        level_strs = columns.format(sparsify=self.merge_cells, adjoin=False, names=False)
+        level_strs = columns.format(sparsify=self.merge_cells, adjoin=False,
+                                    names=False)
         level_lengths = _get_level_lengths(level_strs)
        coloffset = 0
         lnum = 0
 
@@ -1783,23 +1788,16 @@ def _format_header_mi(self):
             name = columns.names[lnum]
             yield ExcelCell(lnum, coloffset, name, header_style)
 
-        for lnum, (spans, levels, labels) in enumerate(zip(level_lengths,
-                                                           columns.levels,
-                                                           columns.labels)
-                                                       ):
+        for lnum, (spans, levels, labels) in enumerate(zip(
+                level_lengths, columns.levels, columns.labels)):
             values = levels.take(labels)
             for i in spans:
                 if spans[i] > 1:
-                    yield ExcelCell(lnum,
-                                    coloffset + i + 1,
-                                    values[i],
-                                    header_style,
-                                    lnum,
+                    yield ExcelCell(lnum, coloffset + i + 1, values[i],
+                                    header_style, lnum,
                                     coloffset + i + spans[i])
                 else:
-                    yield ExcelCell(lnum,
-                                    coloffset + i + 1,
-                                    values[i],
+                    yield ExcelCell(lnum, coloffset + i + 1, values[i],
                                     header_style)
         else:
             # Format in legacy format with dots to indicate levels.
@@ -1822,8 +1820,8 @@ def _format_header_regular(self):
             colnames = self.columns
             if has_aliases:
                 if len(self.header) != len(self.columns):
-                    raise ValueError(('Writing %d cols but got %d aliases'
-                                      % (len(self.columns), len(self.header))))
+                    raise ValueError('Writing %d cols but got %d aliases' %
+                                     (len(self.columns), len(self.header)))
                 else:
                     colnames = self.header
 
@@ -1864,8 +1862,9 @@ def _format_regular_rows(self):
         if self.index:
             # chek aliases
             # if list only take first as this is not a MultiIndex
-            if self.index_label and isinstance(self.index_label,
-                                               (list, tuple, np.ndarray, Index)):
+            if (self.index_label and
+                    isinstance(self.index_label, (list, tuple, np.ndarray,
+                                                  Index))):
                 index_label = self.index_label[0]
             # if string good to go
             elif self.index_label and isinstance(self.index_label, str):
@@ -1877,9 +1876,7 @@ def _format_regular_rows(self):
                 self.rowcounter += 1
 
             if index_label and self.header is not False:
-                yield ExcelCell(self.rowcounter - 1,
-                                0,
-                                index_label,
+                yield ExcelCell(self.rowcounter - 1, 0, index_label,
                                 header_style)
 
             # write index_values
@@ -1907,8 +1904,9 @@ def _format_hierarchical_rows(self):
         if self.index:
             index_labels = self.df.index.names
             # check for aliases
-            if self.index_label and isinstance(self.index_label,
-                                               (list, tuple, np.ndarray, Index)):
+            if (self.index_label and
+                    isinstance(self.index_label, (list, tuple, np.ndarray,
+                                                  Index))):
                 index_labels = self.index_label
 
             # MultiIndex columns require an extra row
@@ -1919,13 +1917,11 @@ def _format_hierarchical_rows(self):
                 self.rowcounter += 1
 
             # if index labels are not empty go ahead and dump
-            if (any(x is not None for x in index_labels)
-                    and self.header is not False):
+            if (any(x is not None for x in index_labels) and
+                    self.header is not False):
 
                 for cidx, name in enumerate(index_labels):
-                    yield ExcelCell(self.rowcounter - 1,
-                                    cidx,
-                                    name,
+                    yield ExcelCell(self.rowcounter - 1, cidx, name,
                                     header_style)
 
             if self.merge_cells:
@@ -1940,27 +1936,21 @@ def _format_hierarchical_rows(self):
                     values = levels.take(labels)
                     for i in spans:
                         if spans[i] > 1:
-                            yield ExcelCell(self.rowcounter + i,
-                                            gcolidx,
-                                            values[i],
-                                            header_style,
+                            yield ExcelCell(self.rowcounter + i, gcolidx,
+                                            values[i], header_style,
                                             self.rowcounter + i + spans[i] - 1,
                                             gcolidx)
                         else:
-                            yield ExcelCell(self.rowcounter + i,
-                                            gcolidx,
-                                            values[i],
-                                            header_style)
+                            yield ExcelCell(self.rowcounter + i, gcolidx,
+                                            values[i], header_style)
                     gcolidx += 1
 
             else:
                 # Format hierarchical rows with non-merged values.
                for indexcolvals in zip(*self.df.index):
                     for idx, indexcolval in enumerate(indexcolvals):
-                        yield ExcelCell(self.rowcounter + idx,
-                                        gcolidx,
-                                        indexcolval,
-                                        header_style)
+                        yield ExcelCell(self.rowcounter + idx, gcolidx,
+                                        indexcolval, header_style)
                     gcolidx += 1
 
         # Write the body of the frame data series by series.
@@ -2009,18 +1999,16 @@ def format_array(values, formatter, float_format=None, na_rep='NaN',
         digits = get_option("display.precision")
 
     fmt_obj = fmt_klass(values, digits=digits, na_rep=na_rep,
-                        float_format=float_format,
-                        formatter=formatter, space=space,
-                        justify=justify)
+                        float_format=float_format, formatter=formatter,
+                        space=space, justify=justify)
 
     return fmt_obj.get_result()
 
 
 class GenericArrayFormatter(object):
-
     def __init__(self, values, digits=7, formatter=None, na_rep='NaN',
-                 space=12, float_format=None, justify='right',
-                 decimal='.', quoting=None):
+                 space=12, float_format=None, justify='right', decimal='.',
+                 quoting=None):
         self.values = values
         self.digits = digits
         self.na_rep = na_rep
@@ -2044,8 +2032,9 @@ def _format_strings(self):
         else:
             float_format = self.float_format
 
-        formatter = self.formatter if self.formatter is not None else \
-            (lambda x: com.pprint_thing(x, escape_chars=('\t', '\r', '\n')))
+        formatter = (
+            self.formatter if self.formatter is not None else
+            (lambda x: com.pprint_thing(x, escape_chars=('\t', '\r', '\n'))))
 
         def _format(x):
             if self.na_rep is not None and lib.checknull(x):
@@ -2080,7 +2069,6 @@ def _format(x):
 
 
 class FloatArrayFormatter(GenericArrayFormatter):
-
     """
 
     """
@@ -2128,7 +2116,7 @@ def _format_strings(self):
         # this is pretty arbitrary for now
         has_large_values = (abs_vals > 1e8).any()
-        has_small_values = ((abs_vals < 10 ** (-self.digits)) &
+        has_small_values = ((abs_vals < 10**(-self.digits)) &
                             (abs_vals > 0)).any()
 
         if too_long and has_large_values:
@@ -2152,8 +2140,9 @@ def get_formatted_data(self):
         mask = isnull(values)
 
         # the following variable is to be applied on each value to format it
-        # according to the string containing the float format, self.float_format
-        # and the character to use as decimal separator, self.decimal
+        # according to the string containing the float format,
+        # self.float_format and the character to use as decimal separator,
+        # self.decimal
         formatter = None
         if self.float_format and self.decimal != '.':
             formatter = lambda v: (
@@ -2171,14 +2160,13 @@ def get_formatted_data(self):
             values[mask] = self.na_rep
             if formatter:
                 imask = (~mask).ravel()
-                values.flat[imask] = np.array(
-                    [formatter(val) for val in values.ravel()[imask]])
+                values.flat[imask] = np.array([formatter(val)
+                                               for val in values.ravel()[imask]])
 
             return values
 
 
 class IntArrayFormatter(GenericArrayFormatter):
-
     def _format_strings(self):
         formatter = self.formatter or (lambda x: '% d' % x)
         fmt_values = [formatter(x) for x in self.values]
@@ -2198,14 +2186,15 @@ def _format_strings(self):
         if not isinstance(values, DatetimeIndex):
             values = DatetimeIndex(values)
 
-        fmt_values = format_array_from_datetime(values.asi8.ravel(),
-                                                format=_get_format_datetime64_from_values(values, self.date_format),
-                                                na_rep=self.nat_rep).reshape(values.shape)
+        fmt_values = format_array_from_datetime(
+            values.asi8.ravel(),
+            format=_get_format_datetime64_from_values(values,
+                                                      self.date_format),
+            na_rep=self.nat_rep).reshape(values.shape)
         return fmt_values.tolist()
 
 
 class PeriodArrayFormatter(IntArrayFormatter):
-
     def _format_strings(self):
         values = PeriodIndex(self.values).to_native_types()
         formatter = self.formatter or (lambda x: '%s' % x)
@@ -2214,7 +2203,6 @@ def _format_strings(self):
 
 
 class CategoricalArrayFormatter(GenericArrayFormatter):
-
     def __init__(self, values, *args, **kwargs):
         GenericArrayFormatter.__init__(self, values, *args, **kwargs)
 
@@ -2235,11 +2223,13 @@ def _is_dates_only(values):
     values_int = values.asi8
     consider_values = values_int != iNaT
     one_day_nanos = (86400 * 1e9)
-    even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0
+    even_days = np.logical_and(consider_values,
+                               values_int % one_day_nanos != 0).sum() == 0
     if even_days:
         return True
     return False
 
+
 def _format_datetime64(x, tz=None, nat_rep='NaT'):
     if x is None or lib.checknull(x):
         return nat_rep
@@ -2262,12 +2252,12 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None):
     else:
         return x._date_repr
 
+
 def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None):
 
     if is_dates_only:
-        return lambda x, tz=None: _format_datetime64_dateonly(x,
-                                                              nat_rep=nat_rep,
-                                                              date_format=date_format)
+        return lambda x, tz=None: _format_datetime64_dateonly(
+            x, nat_rep=nat_rep, date_format=date_format)
     else:
         return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep)
 
@@ -2281,27 +2271,29 @@ def _get_format_datetime64_from_values(values, date_format):
 
 
 class Datetime64TZFormatter(Datetime64Formatter):
-
     def _format_strings(self):
         """ we by definition have a TZ """
 
         values = self.values.asobject
         is_dates_only = _is_dates_only(values)
-        formatter = (self.formatter or _get_format_datetime64(is_dates_only, date_format=self.date_format))
-        fmt_values = [ formatter(x) for x in values ]
+        formatter = (self.formatter or
+                     _get_format_datetime64(is_dates_only,
+                                            date_format=self.date_format))
+        fmt_values = [formatter(x) for x in values]
 
         return fmt_values
 
 
-class Timedelta64Formatter(GenericArrayFormatter):
-
+class Timedelta64Formatter(GenericArrayFormatter):
     def __init__(self, values, nat_rep='NaT', box=False, **kwargs):
         super(Timedelta64Formatter, self).__init__(values, **kwargs)
         self.nat_rep = nat_rep
         self.box = box
 
     def _format_strings(self):
-        formatter = self.formatter or _get_format_timedelta64(self.values, nat_rep=self.nat_rep,
-                                                              box=self.box)
+        formatter = (self.formatter or
+                     _get_format_timedelta64(self.values, nat_rep=self.nat_rep,
+                                             box=self.box))
         fmt_values = np.array([formatter(x) for x in self.values])
         return fmt_values
 
@@ -2319,8 +2311,10 @@ def _get_format_timedelta64(values, nat_rep='NaT', box=False):
     consider_values = values_int != iNaT
 
     one_day_nanos = (86400 * 1e9)
-    even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0
-    all_sub_day = np.logical_and(consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0
+    even_days = np.logical_and(consider_values,
+                               values_int % one_day_nanos != 0).sum() == 0
+    all_sub_day = np.logical_and(
+        consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0
 
     if even_days:
         format = 'even_day'
@@ -2343,8 +2337,7 @@ def _formatter(x):
     return _formatter
 
 
-def _make_fixed_width(strings, justify='right', minimum=None,
-                      adj=None):
+def _make_fixed_width(strings, justify='right', minimum=None, adj=None):
 
     if len(strings) == 0 or justify == 'all':
         return strings
@@ -2381,7 +2374,7 @@ def _trim_zeros(str_floats, na_rep='NaN'):
     def _cond(values):
         non_na = [x for x in values if x != na_rep]
         return (len(non_na) > 0 and all([x.endswith('0') for x in non_na]) and
-                not(any([('e' in x) or ('E' in x) for x in non_na])))
+                not (any([('e' in x) or ('E' in x) for x in non_na])))
 
     while _cond(trimmed):
         trimmed = [x[:-1] if x != na_rep else x for x in trimmed]
@@ -2417,8 +2410,7 @@ def _has_names(index):
     else:
         return index.name is not None
 
-
-# ------------------------------------------------------------------------------
+# -----------------------------------------------------------------------------
 # Global formatting options
 
 _initial_defencoding = None
@@ -2496,7 +2488,6 @@ def get_console_size():
 
 
 class EngFormatter(object):
-
     """
     Formats float values according to engineering format.
 
@@ -2576,7 +2567,7 @@ def __call__(self, num):
         else:
             prefix = 'E+%02d' % int_pow10
 
-        mant = sign * dnum / (10 ** pow10)
+        mant = sign * dnum / (10**pow10)
 
         if self.accuracy is None:  # pragma: no cover
             format_str = u("% g%s")
@@ -2626,16 +2617,16 @@ def _binify(cols, line_width):
     bins.append(len(cols))
     return bins
 
+
 if __name__ == '__main__':
     arr = np.array([746.03, 0.00, 5620.00, 1592.36])
     # arr = np.array([11111111.1, 1.55])
     # arr = [314200.0034, 1.4125678]
-    arr = np.array([327763.3119, 345040.9076, 364460.9915, 398226.8688,
-                    383800.5172, 433442.9262, 539415.0568, 568590.4108,
-                    599502.4276, 620921.8593, 620898.5294, 552427.1093,
-                    555221.2193, 519639.7059, 388175.7, 379199.5854,
-                    614898.25, 504833.3333, 560600., 941214.2857,
-                    1134250., 1219550., 855736.85, 1042615.4286,
-                    722621.3043, 698167.1818, 803750.])
+    arr = np.array(
+        [327763.3119, 345040.9076, 364460.9915, 398226.8688, 383800.5172,
+         433442.9262, 539415.0568, 568590.4108, 599502.4276, 620921.8593,
+         620898.5294, 552427.1093, 555221.2193, 519639.7059, 388175.7,
+         379199.5854, 614898.25, 504833.3333, 560600., 941214.2857, 1134250.,
+         1219550., 855736.85, 1042615.4286, 722621.3043, 698167.1818, 803750.])
     fmt = FloatArrayFormatter(arr, digits=7)
     print(fmt.get_result())
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index 719f35dd90ce2..4dffaa0b0c416 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -24,7 +24,6 @@
 
 
 class _Unstacker(object):
-
     """
     Helper class to unstack data / pivot with multi-level index
 
@@ -159,10 +158,9 @@ def get_result(self):
 
         # may need to coerce categoricals here
         if self.is_categorical is not None:
-            values = [ Categorical.from_array(values[:,i],
-                                              categories=self.is_categorical.categories,
-                                              ordered=True)
-                       for i in range(values.shape[-1]) ]
+            values = [Categorical.from_array(
+                values[:, i], categories=self.is_categorical.categories,
+                ordered=True) for i in range(values.shape[-1])]
 
         return DataFrame(values, index=index, columns=columns)
 
@@ -188,8 +186,8 @@ def get_new_values(self):
 
         # is there a simpler / faster way of doing this?
        for i in range(values.shape[1]):
-            chunk = new_values[:, i * width: (i + 1) * width]
-            mask_chunk = new_mask[:, i * width: (i + 1) * width]
+            chunk = new_values[:, i * width:(i + 1) * width]
+            mask_chunk = new_mask[:, i * width:(i + 1) * width]
 
             chunk.flat[self.mask] = self.sorted_values[:, i]
             mask_chunk.flat[self.mask] = True
@@ -233,10 +231,8 @@ def get_new_index(self):
                 lev = lev.insert(len(lev), _get_na_value(lev.dtype.type))
             return lev.take(lab)
 
-        return MultiIndex(levels=self.new_index_levels,
-                          labels=result_labels,
-                          names=self.new_index_names,
-                          verify_integrity=False)
+        return MultiIndex(levels=self.new_index_levels, labels=result_labels,
+                          names=self.new_index_names, verify_integrity=False)
 
 
 def _unstack_multiple(data, clocs):
@@ -264,8 +260,8 @@ def _unstack_multiple(data, clocs):
     group_index = get_group_index(clabels, shape, sort=False, xnull=False)
 
     comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
-    recons_labels = decons_obs_group_ids(comp_ids,
-                                         obs_ids, shape, clabels, xnull=False)
+    recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels,
+                                         xnull=False)
 
     dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                              labels=rlabels + [comp_ids],
@@ -288,8 +284,7 @@ def _unstack_multiple(data, clocs):
 
         return result
 
-    dummy = DataFrame(data.values, index=dummy_index,
-                      columns=data.columns)
+    dummy = DataFrame(data.values, index=dummy_index, columns=data.columns)
 
     unstacked = dummy.unstack('__placeholder__')
     if isinstance(unstacked, Series):
@@ -329,8 +324,7 @@ def pivot(self, index=None, columns=None, values=None):
         else:
             index = self[index]
         indexed = Series(self[values].values,
-                         index=MultiIndex.from_arrays([index,
-                                                       self[columns]]))
+                         index=MultiIndex.from_arrays([index, self[columns]]))
         return indexed.unstack(columns)
 
 
@@ -461,6 +455,7 @@ def stack(frame, level=-1, dropna=True):
     -------
     stacked : Series
     """
+
     def factorize(index):
         if index.is_unique:
             return index, np.arange(len(index))
@@ -492,11 +487,10 @@ def factorize(index):
         new_index = MultiIndex(levels=new_levels, labels=new_labels,
                                names=new_names, verify_integrity=False)
     else:
-        levels, (ilab, clab) = \
-            zip(*map(factorize, (frame.index, frame.columns)))
+        levels, (ilab, clab) = zip(*map(factorize, (frame.index,
+                                                    frame.columns)))
         labels = ilab.repeat(K), np.tile(clab, N).ravel()
-        new_index = MultiIndex(levels=levels,
-                               labels=labels,
+        new_index = MultiIndex(levels=levels, labels=labels,
                                names=[frame.index.name, frame.columns.name],
                                verify_integrity=False)
 
@@ -541,8 +535,8 @@ def stack_multiple(frame, level, dropna=True):
             level = updated_level
 
     else:
-        raise ValueError("level should contain all level names or all level numbers, "
-                         "not a mixture of the two.")
+        raise ValueError("level should contain all level names or all level "
+                         "numbers, not a mixture of the two.")
 
     return result
 
@@ -550,12 +544,12 @@ def stack_multiple(frame, level, dropna=True):
 def _stack_multi_columns(frame, level_num=-1, dropna=True):
     def _convert_level_number(level_num, columns):
         """
-        Logic for converting the level number to something
-        we can safely pass to swaplevel:
+        Logic for converting the level number to something we can safely pass
+        to swaplevel:
 
-        We generally want to convert the level number into
-        a level name, except when columns do not have names,
-        in which case we must leave as a level number
+        We generally want to convert the level number into a level name, except
+        when columns do not have names, in which case we must leave as a level
+        number
         """
         if level_num in columns.names:
             return columns.names[level_num]
@@ -587,10 +581,9 @@ def _convert_level_number(level_num, columns):
 
     # tuple list excluding level for grouping columns
     if len(frame.columns.levels) > 2:
-        tuples = list(zip(*[
-            lev.take(lab) for lev, lab in
-            zip(this.columns.levels[:-1], this.columns.labels[:-1])
-        ]))
+        tuples = list(zip(*[lev.take(lab)
+                            for lev, lab in zip(this.columns.levels[:-1],
+                                                this.columns.labels[:-1])]))
         unique_groups = [key for key, _ in itertools.groupby(tuples)]
         new_names = this.columns.names[:-1]
         new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
@@ -655,8 +648,8 @@ def _convert_level_number(level_num, columns):
     return result
 
 
-def melt(frame, id_vars=None, value_vars=None,
-         var_name=None, value_name='value', col_level=None):
+def melt(frame, id_vars=None, value_vars=None, var_name=None,
+         value_name='value', col_level=None):
     """
     "Unpivots" a DataFrame from wide format to long format, optionally leaving
     identifier variables set.
@@ -772,8 +765,8 @@ def melt(frame, id_vars=None, value_vars=None,
             if len(frame.columns.names) == len(set(frame.columns.names)):
                 var_name = frame.columns.names
             else:
-                var_name = ['variable_%s' % i for i in
-                            range(len(frame.columns.names))]
+                var_name = ['variable_%s' % i
+                            for i in range(len(frame.columns.names))]
         else:
             var_name = [frame.columns.name if frame.columns.name is not None
                         else 'variable']
@@ -922,6 +915,7 @@ def wide_to_long(df, stubnames, i, j):
     `pandas.melt` under the hood, but is hard-coded to "do the right thing"
     in a typicaly case.
     """
+
     def get_var_names(df, regex):
         return df.filter(regex=regex).columns.tolist()
 
@@ -948,6 +942,7 @@ def melt_stub(df, stub, i, j):
         newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
     return newdf.set_index([i, j])
 
+
 def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
                 columns=None, sparse=False):
     """
@@ -1026,21 +1021,20 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
 
         # determine columns being encoded
         if columns is None:
-            columns_to_encode = data.select_dtypes(include=['object',
-                                                            'category']).columns
+            columns_to_encode = data.select_dtypes(
+                include=['object', 'category']).columns
         else:
             columns_to_encode = columns
 
         # validate prefixes and separator to avoid silently dropping cols
        def check_len(item, name):
-            length_msg = ("Length of '{0}' ({1}) did "
-                          "not match the length of the columns "
-                          "being encoded ({2}).")
+            length_msg = ("Length of '{0}' ({1}) did not match the length of "
+                          "the columns being encoded ({2}).")
 
             if com.is_list_like(item):
                 if not len(item) == len(columns_to_encode):
                     raise ValueError(length_msg.format(name, len(item),
-                                                       len(columns_to_encode)))
+                                     len(columns_to_encode)))
 
         check_len(prefix, 'prefix')
         check_len(prefix_sep, 'prefix_sep')
@@ -1075,7 +1069,8 @@ def check_len(item, name):
     return result
 
 
-def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False):
+def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
+                    sparse=False):
     # Series avoids inconsistent NaN handling
     cat = Categorical.from_array(Series(data), ordered=True)
     levels = cat.categories
@@ -1099,8 +1094,7 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False):
     number_of_cols = len(levels)
 
     if prefix is not None:
-        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
-                      for v in levels]
+        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
     else:
         dummy_cols = levels
 
@@ -1112,7 +1106,7 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False):
 
     if sparse:
        sparse_series = {}
         N = len(data)
-        sp_indices = [ [] for _ in range(len(dummy_cols)) ]
+        sp_indices = [[] for _ in range(len(dummy_cols))]
         for ndx, code in enumerate(codes):
             if code == -1:
                 # Blank entries if not dummy_na and code == -1, #GH4446
                 continue
             sp_indices[code].append(ndx)
 
         for col, ixs in zip(dummy_cols, sp_indices):
-            sarr = SparseArray(np.ones(len(ixs)), sparse_index=IntIndex(N, ixs),
-                               fill_value=0)
+            sarr = SparseArray(np.ones(len(ixs)),
+                               sparse_index=IntIndex(N, ixs), fill_value=0)
             sparse_series[col] = SparseSeries(data=sarr, index=index)
 
         return SparseDataFrame(sparse_series, index=index, columns=dummy_cols)
@@ -1157,10 +1151,7 @@ def make_axis_dummies(frame, axis='minor', transform=None):
     dummies : DataFrame
         Column names taken from chosen axis
     """
-    numbers = {
-        'major': 0,
-        'minor': 1
-    }
+    numbers = {'major': 0, 'minor': 1}
     num = numbers.get(axis, axis)
 
     items = frame.index.levels[num]
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 73e645039506f..73cca93a498c5 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -17,11 +17,10 @@
                                 _default_index, _maybe_upcast,
                                 _asarray_tuplesafe, _infer_dtype_from_scalar,
                                 is_list_like, _values_from_object,
-                                is_categorical_dtype, is_datetime64tz_dtype,
-                                needs_i8_conversion, i8_boxer,
-                                _possibly_cast_to_datetime, _possibly_castable,
-                                _possibly_convert_platform, _try_sort,
-                                is_int64_dtype, is_internal_type, is_datetimetz,
+                                is_categorical_dtype, needs_i8_conversion,
+                                i8_boxer, _possibly_cast_to_datetime,
+                                _possibly_castable, _possibly_convert_platform,
+                                _try_sort, is_internal_type, is_datetimetz,
                                 _maybe_match_name, ABCSparseArray,
                                 _coerce_to_dtype, SettingWithCopyError,
                                 _maybe_box_datetimelike, ABCDataFrame,
@@ -42,6 +41,7 @@
 from pandas.util.terminal import get_terminal_size
 from pandas.compat import zip, u, OrderedDict, StringIO
 
+
 import pandas.core.ops as ops
 from pandas.core import algorithms
 
@@ -49,7 +49,7 @@
 import pandas.core.datetools as datetools
 import pandas.core.format as fmt
 import pandas.core.nanops as nanops
-from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
+from pandas.util.decorators import Appender, deprecate_kwarg
 import pandas.lib as lib
 import pandas.tslib as tslib
 
@@ -62,15 +62,11 @@
 
 __all__ = ['Series']
 
-
 _shared_doc_kwargs = dict(
-    axes='index',
-    klass='Series',
-    axes_single_arg="{0, 'index'}",
+    axes='index', klass='Series', axes_single_arg="{0, 'index'}",
     inplace="""inplace : boolean, default False
-    If True, performs operation inplace and returns None.""",
-    duplicated='Series'
-)
+    If True, performs operation inplace and returns None.""",
+    duplicated='Series')
 
 
 def _coerce_method(converter):
@@ -79,17 +75,17 @@ def _coerce_method(converter):
     def wrapper(self):
         if len(self) == 1:
             return converter(self.iloc[0])
-        raise TypeError(
-            "cannot convert the series to {0}".format(str(converter)))
-    return wrapper
+        raise TypeError("cannot convert the series to "
+                        "{0}".format(str(converter)))
 
+    return wrapper
 
-#----------------------------------------------------------------------
+
+# ----------------------------------------------------------------------
 # Series class
 
-class Series(base.IndexOpsMixin, strings.StringAccessorMixin, generic.NDFrame,):
-
+class Series(base.IndexOpsMixin, strings.StringAccessorMixin,
+             generic.NDFrame,):
     """
     One-dimensional ndarray with axis labels (including time series).
@@ -182,14 +178,14 @@ def __init__(self, data=None, index=None, dtype=None, name=None, else: data = np.nan elif isinstance(index, PeriodIndex): - data = [data.get(i, nan) - for i in index] if data else np.nan + data = ([data.get(i, nan) for i in index] + if data else np.nan) else: data = lib.fast_multiget(data, index.values, default=np.nan) except TypeError: - data = [data.get(i, nan) - for i in index] if data else np.nan + data = ([data.get(i, nan) for i in index] + if data else np.nan) elif isinstance(data, SingleBlockManager): if index is None: @@ -198,7 +194,8 @@ def __init__(self, data=None, index=None, dtype=None, name=None, data = data.reindex(index, copy=copy) elif isinstance(data, Categorical): if dtype is not None: - raise ValueError("cannot specify a dtype with a Categorical") + raise ValueError("cannot specify a dtype with a " + "Categorical") elif (isinstance(data, types.GeneratorType) or (compat.PY3 and isinstance(data, map))): data = list(data) @@ -241,7 +238,8 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, from pandas.sparse.series import SparseSeries cls = SparseSeries - return cls(arr, index=index, name=name, dtype=dtype, copy=copy, fastpath=fastpath) + return cls(arr, index=index, name=name, dtype=dtype, copy=copy, + fastpath=fastpath) @property def _constructor(self): @@ -259,12 +257,11 @@ def _can_hold_na(self): @property def is_time_series(self): - msg = "is_time_series is deprecated. Please use Series.index.is_all_dates" - warnings.warn(msg, FutureWarning, stacklevel=2) + warnings.warn("is_time_series is deprecated. Please use " + "Series.index.is_all_dates", FutureWarning, stacklevel=2) # return self._subtyp in ['time_series', 'sparse_time_series'] return self.index.is_all_dates - _index = None def _set_axis(self, axis, labels, fastpath=False): @@ -276,7 +273,8 @@ def _set_axis(self, axis, labels, fastpath=False): is_all_dates = labels.is_all_dates if is_all_dates: - if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + if not isinstance(labels, + (DatetimeIndex, PeriodIndex, TimedeltaIndex)): labels = DatetimeIndex(labels) # need to set here becuase we changed the index @@ -343,7 +341,8 @@ def values(self): Timezone aware datetime data is converted to UTC: - >>> pd.Series(pd.date_range('20130101',periods=3,tz='US/Eastern')).values + >>> pd.Series(pd.date_range('20130101', periods=3, + tz='US/Eastern')).values array(['2013-01-01T00:00:00.000000000-0500', '2013-01-02T00:00:00.000000000-0500', '2013-01-03T00:00:00.000000000-0500'], dtype='datetime64[ns]') @@ -449,9 +448,10 @@ def __array_prepare__(self, result, context=None): if context is not None and not isinstance(self._values, np.ndarray): obj = context[1][0] raise TypeError("{obj} with dtype {dtype} cannot perform " - "the numpy op {op}".format(obj=type(obj).__name__, - dtype=getattr(obj,'dtype',None), - op=context[0].__name__)) + "the numpy op {op}".format( + obj=type(obj).__name__, + dtype=getattr(obj, 'dtype', None), + op=context[0].__name__)) return result # complex @@ -508,9 +508,7 @@ def _unpickle_series_compat(self, state): # indexers @property def axes(self): - """ - Return a list of the row axis labels - """ + """Return a list of the row axis labels""" return [self.index] def _ixs(self, i, axis=0): @@ -551,7 +549,8 @@ def _is_mixed_type(self): return False def _slice(self, slobj, axis=0, kind=None): - slobj = self.index._convert_slice_indexer(slobj, kind=kind or 'getitem') + slobj = self.index._convert_slice_indexer(slobj, + kind=kind or 'getitem') return 
self._get_values(slobj) def __getitem__(self, key): @@ -564,9 +563,9 @@ def __getitem__(self, key): # we need to box if we have a non-unique index here # otherwise have inline ndarray/lists if not self.index.is_unique: - result = self._constructor(result, - index=[key]*len(result) - ,dtype=self.dtype).__finalize__(self) + result = self._constructor( + result, index=[key] * len(result), + dtype=self.dtype).__finalize__(self) return result except InvalidIndexError: @@ -582,7 +581,8 @@ def __getitem__(self, key): else: # we can try to coerce the indexer (or this will raise) - new_key = self.index._convert_scalar_indexer(key,kind='getitem') + new_key = self.index._convert_scalar_indexer(key, + kind='getitem') if type(new_key) != type(key): return self.__getitem__(new_key) raise @@ -604,8 +604,8 @@ def _get_with(self, key): indexer = self.index._convert_slice_indexer(key, kind='getitem') return self._get_values(indexer) elif isinstance(key, ABCDataFrame): - raise TypeError('Indexing a Series with DataFrame is not supported, '\ - 'use the appropriate DataFrame column') + raise TypeError('Indexing a Series with DataFrame is not ' + 'supported, use the appropriate DataFrame column') else: if isinstance(key, tuple): try: @@ -669,7 +669,6 @@ def _get_values(self, indexer): return self._values[indexer] def __setitem__(self, key, value): - def setitem(key, value): try: self._set_with_engine(key, value) @@ -678,8 +677,8 @@ def setitem(key, value): raise except (KeyError, ValueError): values = self._values - if (com.is_integer(key) - and not self.index.inferred_type == 'integer'): + if (com.is_integer(key) and + not self.index.inferred_type == 'integer'): values[key] = value return @@ -694,17 +693,18 @@ def setitem(key, value): value = tslib.iNaT try: - self.index._engine.set_value(self._values, key, value) + self.index._engine.set_value(self._values, key, + value) return - except (TypeError): + except TypeError: pass self.loc[key] = value return except TypeError as e: - if isinstance(key, tuple) and not isinstance(self.index, - MultiIndex): + if (isinstance(key, tuple) and + not isinstance(self.index, MultiIndex)): raise ValueError("Can only tuple-index with a MultiIndex") # python 3 type errors should be raised @@ -716,7 +716,7 @@ def setitem(key, value): try: self.where(~key, value, inplace=True) return - except (InvalidIndexError): + except InvalidIndexError: pass self._set_with(key, value) @@ -752,7 +752,7 @@ def _set_with(self, key, value): try: key = list(key) except: - key = [ key ] + key = [key] if isinstance(key, Index): key_type = key.inferred_type @@ -777,8 +777,7 @@ def _set_labels(self, key, value): indexer = self.index.get_indexer(key) mask = indexer == -1 if mask.any(): - raise ValueError('%s not contained in the index' - % str(key[mask])) + raise ValueError('%s not contained in the index' % str(key[mask])) self._set_values(indexer, value) def _set_values(self, key, value): @@ -828,8 +827,8 @@ def iget_value(self, i, axis=0): """ DEPRECATED. Use ``.iloc[i]`` or ``.iat[i]`` instead """ - warnings.warn("iget_value(i) is deprecated. Please use .iloc[i] or .iat[i]", - FutureWarning, stacklevel=2) + warnings.warn("iget_value(i) is deprecated. 
Please use .iloc[i] or " + ".iat[i]", FutureWarning, stacklevel=2) return self._ixs(i) def iget(self, i, axis=0): @@ -951,8 +950,8 @@ def __unicode__(self): """ buf = StringIO(u("")) width, height = get_terminal_size() - max_rows = (height if get_option("display.max_rows") == 0 - else get_option("display.max_rows")) + max_rows = (height if get_option("display.max_rows") == 0 else + get_option("display.max_rows")) self.to_string(buf=buf, name=self.name, dtype=self.dtype, max_rows=max_rows) @@ -961,7 +960,8 @@ def __unicode__(self): return result def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, - index=True, length=False, dtype=False, name=False, max_rows=None): + index=True, length=False, dtype=False, name=False, + max_rows=None): """ Render a string representation of the Series @@ -1012,17 +1012,15 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, with open(buf, 'w') as f: f.write(the_repr) - def _get_repr( - self, name=False, header=True, index=True, length=True, dtype=True, - na_rep='NaN', float_format=None, max_rows=None): + def _get_repr(self, name=False, header=True, index=True, length=True, + dtype=True, na_rep='NaN', float_format=None, max_rows=None): """ Internal function, should always return unicode string """ - formatter = fmt.SeriesFormatter(self, name=name, - length=length, header=header, - index=index, dtype=dtype, - na_rep=na_rep, + formatter = fmt.SeriesFormatter(self, name=name, length=length, + header=header, index=index, + dtype=dtype, na_rep=na_rep, float_format=float_format, max_rows=max_rows) result = formatter.to_string() @@ -1052,11 +1050,11 @@ def iteritems(self): if compat.PY3: # pragma: no cover items = iteritems - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Misc public methods def keys(self): - "Alias for index" + """Alias for index""" return self.index def tolist(self): @@ -1111,7 +1109,7 @@ def to_sparse(self, kind='block', fill_value=None): return SparseSeries(self, kind=kind, fill_value=fill_value).__finalize__(self) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Statistics, overridden ndarray methods # TODO: integrate bottleneck @@ -1148,7 +1146,8 @@ def count(self, level=None): obs = lab[notnull(self.values)] out = np.bincount(obs, minlength=len(lev) or None) - return self._constructor(out, index=lev, dtype='int64').__finalize__(self) + return self._constructor(out, index=lev, + dtype='int64').__finalize__(self) def mode(self): """Returns the mode(s) of the dataset. 
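The `count` hunk above only rewraps a constructor call; the level-wise tally itself (map non-null positions to level codes, then `np.bincount` them) is visible in the context lines. A minimal standalone sketch of that logic, not part of the patch, with a `.codes`/`.labels` fallback as an assumption to cover both this era of pandas and later releases:

    import numpy as np
    import pandas as pd

    idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)],
                                    names=['key', 'num'])
    s = pd.Series([1.0, np.nan, 3.0], index=idx)

    level = 0
    # older pandas spells MultiIndex.codes as .labels
    codes = getattr(s.index, 'codes', getattr(s.index, 'labels', None))
    lab, lev = np.asarray(codes[level]), s.index.levels[level]

    obs = lab[pd.notnull(s.values)]                     # codes of non-null rows
    out = np.bincount(obs, minlength=len(lev) or None)  # per-level counts
    print(pd.Series(out, index=lev, dtype='int64'))     # 'a' -> 1, 'b' -> 1
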
@@ -1169,12 +1168,14 @@ def mode(self): # TODO: Add option for bins like value_counts() return algorithms.mode(self) - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', + False: 'first'}) @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs) def drop_duplicates(self, keep='first', inplace=False): return super(Series, self).drop_duplicates(keep=keep, inplace=inplace) - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', + False: 'first'}) @Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs) def duplicated(self, keep='first'): return super(Series, self).duplicated(keep=keep) @@ -1258,8 +1259,7 @@ def round(self, decimals=0): """ result = _values_from_object(self).round(decimals) - result = self._constructor(result, - index=self.index).__finalize__(self) + result = self._constructor(result, index=self.index).__finalize__(self) return result @@ -1324,8 +1324,7 @@ def multi(values, qs, **kwargs): return self._maybe_box(lambda values: multi(values, q, **kwargs), dropna=True) - def corr(self, other, method='pearson', - min_periods=None): + def corr(self, other, method='pearson', min_periods=None): """ Compute correlation with `other` Series, excluding missing values @@ -1503,7 +1502,7 @@ def searchsorted(self, v, side='left', sorter=None): return self._values.searchsorted(Series(v)._values, side=side, sorter=sorter) - #------------------------------------------------------------------------------ + # ------------------------------------------------------------------- # Combination def append(self, to_append, verify_integrity=False): @@ -1585,7 +1584,8 @@ def _binop(self, other, func, level=None, fill_value=None): this = self if not self.index.equals(other.index): - this, other = self.align(other, level=level, join='outer', copy=False) + this, other = self.align(other, level=level, join='outer', + copy=False) new_index = this.index this_vals = this.values @@ -1657,7 +1657,8 @@ def combine_first(self, other): new_index = self.index.union(other.index) this = self.reindex(new_index, copy=False) other = other.reindex(new_index, copy=False) - name = _maybe_match_name(self, other) + # TODO: do we need name? 
+ name = _maybe_match_name(self, other) # noqa rs_vals = com._where_compat(isnull(this), other._values, this._values) return self._constructor(rs_vals, index=new_index).__finalize__(self) @@ -1676,7 +1677,7 @@ def update(self, other): self._data = self._data.putmask(mask=mask, new=other, inplace=True) self._maybe_update_cacher() - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Reindexing, sorting @Appender(generic._shared_docs['sort_values'] % _shared_doc_kwargs) @@ -1750,19 +1751,21 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, ascending=ascending) new_values = self._values.take(indexer) - result = self._constructor(new_values, index=new_index) + result = self._constructor(new_values, index=new_index) if inplace: self._update_inplace(result) else: return result.__finalize__(self) - def sort(self, axis=0, ascending=True, kind='quicksort', na_position='last', inplace=True): + def sort(self, axis=0, ascending=True, kind='quicksort', + na_position='last', inplace=True): """ - DEPRECATED: use :meth:`Series.sort_values(inplace=True)` for INPLACE sorting + DEPRECATED: use :meth:`Series.sort_values(inplace=True)` for INPLACE + sorting - Sort values and index labels by value. This is an inplace sort by default. - Series.order is the equivalent but returns a new Series. + Sort values and index labels by value. This is an inplace sort by + default. Series.order is the equivalent but returns a new Series. Parameters ---------- @@ -1782,24 +1785,24 @@ def sort(self, axis=0, ascending=True, kind='quicksort', na_position='last', inp -------- Series.sort_values """ - warnings.warn("sort is deprecated, use sort_values(inplace=True) for for INPLACE sorting", - FutureWarning, stacklevel=2) + warnings.warn("sort is deprecated, use sort_values(inplace=True) for " + "INPLACE sorting", FutureWarning, stacklevel=2) - return self.sort_values(ascending=ascending, - kind=kind, - na_position=na_position, - inplace=inplace) + return self.sort_values(ascending=ascending, kind=kind, + na_position=na_position, inplace=inplace) - def order(self, na_last=None, ascending=True, kind='quicksort', na_position='last', inplace=False): + def order(self, na_last=None, ascending=True, kind='quicksort', + na_position='last', inplace=False): """ DEPRECATED: use :meth:`Series.sort_values` Sorts Series object, by value, maintaining index-value link. - This will return a new Series by default. Series.sort is the equivalent but as an inplace method. + This will return a new Series by default. Series.sort is the equivalent + but as an inplace method. Parameters ---------- - na_last : boolean (optional, default=True) (DEPRECATED; use na_position) + na_last : boolean (optional, default=True)--DEPRECATED; use na_position Put NaN's at beginning or end ascending : boolean, default True Sort ascending. 
Passing False sorts descending @@ -1823,10 +1826,8 @@ def order(self, na_last=None, ascending=True, kind='quicksort', na_position='las warnings.warn("order is deprecated, use sort_values(...)", FutureWarning, stacklevel=2) - return self.sort_values(ascending=ascending, - kind=kind, - na_position=na_position, - inplace=inplace) + return self.sort_values(ascending=ascending, kind=kind, + na_position=na_position, inplace=inplace) def argsort(self, axis=0, kind='quicksort', order=None): """ @@ -1853,8 +1854,8 @@ def argsort(self, axis=0, kind='quicksort', order=None): mask = isnull(values) if mask.any(): - result = Series( - -1, index=self.index, name=self.name, dtype='int64') + result = Series(-1, index=self.index, name=self.name, + dtype='int64') notmask = ~mask result[notmask] = np.argsort(values[notmask], kind=kind) return self._constructor(result, @@ -1889,11 +1890,13 @@ def rank(self, method='average', na_option='keep', ascending=True, ------- ranks : Series """ - ranks = algorithms.rank(self._values, method=method, na_option=na_option, - ascending=ascending, pct=pct) + ranks = algorithms.rank(self._values, method=method, + na_option=na_option, ascending=ascending, + pct=pct) return self._constructor(ranks, index=self.index).__finalize__(self) - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', + False: 'first'}) def nlargest(self, n=5, keep='first'): """Return the largest `n` elements. @@ -1914,8 +1917,8 @@ def nlargest(self, n=5, keep='first'): Notes ----- - Faster than ``.sort_values(ascending=False).head(n)`` for small `n` relative - to the size of the ``Series`` object. + Faster than ``.sort_values(ascending=False).head(n)`` for small `n` + relative to the size of the ``Series`` object. See Also -------- @@ -1930,7 +1933,8 @@ def nlargest(self, n=5, keep='first'): """ return algorithms.select_n(self, n=n, keep=keep, method='nlargest') - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', + False: 'first'}) def nsmallest(self, n=5, keep='first'): """Return the smallest `n` elements. @@ -1987,7 +1991,8 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): Series.sort_index(level=...) """ - return self.sort_index(level=level, ascending=ascending, sort_remaining=sort_remaining) + return self.sort_index(level=level, ascending=ascending, + sort_remaining=sort_remaining) def swaplevel(self, i, j, copy=True): """ @@ -2063,7 +2068,7 @@ def unstack(self, level=-1): from pandas.core.reshape import unstack return unstack(self, level) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # function application def map(self, arg, na_action=None): @@ -2259,8 +2264,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, # Validate that 'axis' is consistent with Series's single axis. 
self._get_axis_number(axis) if numeric_only: - raise NotImplementedError( - 'Series.{0} does not implement numeric_only.'.format(name)) + raise NotImplementedError('Series.{0} does not implement ' + 'numeric_only.'.format(name)) return op(delegate, skipna=skipna, **kwds) return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna, @@ -2326,9 +2331,11 @@ def _needs_reindex_multi(self, axes, method, level): def align(self, other, join='outer', axis=None, level=None, copy=True, fill_value=None, method=None, limit=None, fill_axis=0, broadcast_axis=None): - return super(Series, self).align(other, join=join, axis=axis, level=level, copy=copy, - fill_value=fill_value, method=method, limit=limit, - fill_axis=fill_axis, broadcast_axis=broadcast_axis) + return super(Series, self).align(other, join=join, axis=axis, + level=level, copy=copy, + fill_value=fill_value, method=method, + limit=limit, fill_axis=fill_axis, + broadcast_axis=broadcast_axis) @Appender(generic._shared_docs['rename'] % _shared_doc_kwargs) def rename(self, index=None, **kwargs): @@ -2348,8 +2355,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, @Appender(generic._shared_docs['shift'] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0): - return super(Series, self).shift(periods=periods, freq=freq, - axis=axis) + return super(Series, self).shift(periods=periods, freq=freq, axis=axis) def reindex_axis(self, labels, axis=0, **kwargs): """ for compatibility with higher dims """ @@ -2405,8 +2411,7 @@ def take(self, indices, axis=0, convert=True, is_copy=False): """ # check/convert indicies here if convert: - indices = maybe_convert_indices( - indices, len(self._get_axis(axis))) + indices = maybe_convert_indices(indices, len(self._get_axis(axis))) indices = com._ensure_platform_int(indices) new_index = self.index.take(indices) @@ -2492,7 +2497,8 @@ def between(self, left, right, inclusive=True): def from_csv(cls, path, sep=',', parse_dates=True, header=None, index_col=0, encoding=None, infer_datetime_format=False): """ - Read CSV file (DISCOURAGED, please use :func:`pandas.read_csv` instead). + Read CSV file (DISCOURAGED, please use :func:`pandas.read_csv` + instead). It is preferable to use the more powerful :func:`pandas.read_csv` for most general purposes, but ``from_csv`` makes for an easy @@ -2544,16 +2550,15 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None, sep=sep, parse_dates=parse_dates, encoding=encoding, infer_datetime_format=infer_datetime_format) - result = df.iloc[:,0] + result = df.iloc[:, 0] if header is None: result.index.name = result.name = None return result - def to_csv(self, path, index=True, sep=",", na_rep='', - float_format=None, header=False, - index_label=None, mode='w', nanRep=None, encoding=None, - date_format=None, decimal='.'): + def to_csv(self, path, index=True, sep=",", na_rep='', float_format=None, + header=False, index_label=None, mode='w', nanRep=None, + encoding=None, date_format=None, decimal='.'): """ Write Series to a comma-separated values (csv) file @@ -2582,15 +2587,17 @@ def to_csv(self, path, index=True, sep=",", na_rep='', date_format: string, default None Format string for datetime objects. decimal: string, default '.' - Character recognized as decimal separator. E.g. use ',' for European data + Character recognized as decimal separator. E.g. 
use ',' for + European data """ from pandas.core.frame import DataFrame df = DataFrame(self) # result is only a string if no path provided, otherwise None result = df.to_csv(path, index=index, sep=sep, na_rep=na_rep, - float_format=float_format, header=header, - index_label=index_label, mode=mode, nanRep=nanRep, - encoding=encoding, date_format=date_format, decimal=decimal) + float_format=float_format, header=header, + index_label=index_label, mode=mode, nanRep=nanRep, + encoding=encoding, date_format=date_format, + decimal=decimal) if path is None: return result @@ -2607,7 +2614,7 @@ def dropna(self, axis=0, inplace=False, **kwargs): kwargs.pop('how', None) if kwargs: raise TypeError('dropna() got an unexpected keyword ' - 'argument "{0}"'.format(list(kwargs.keys())[0])) + 'argument "{0}"'.format(list(kwargs.keys())[0])) axis = self._get_axis_number(axis or 0) @@ -2655,7 +2662,7 @@ def last_valid_index(self): else: return self.index[len(self) - i - 1] - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Time series-oriented methods def asof(self, where): @@ -2749,7 +2756,7 @@ def to_period(self, freq=None, copy=True): return self._constructor(new_values, index=new_index).__finalize__(self) - #------------------------------------------------------------------------------ + # ------------------------------------------------------------------------- # Datetimelike delegation methods def _make_dt_accessor(self): @@ -2759,9 +2766,10 @@ def _make_dt_accessor(self): raise AttributeError("Can only use .dt accessor with datetimelike " "values") - dt = base.AccessorProperty(CombinedDatetimelikeProperties, _make_dt_accessor) + dt = base.AccessorProperty(CombinedDatetimelikeProperties, + _make_dt_accessor) - #------------------------------------------------------------------------------ + # ------------------------------------------------------------------------- # Categorical methods def _make_cat_accessor(self): @@ -2785,14 +2793,14 @@ def _dir_additions(self): pass return rv -Series._setup_axes(['index'], info_axis=0, stat_axis=0, - aliases={'rows': 0}) + +Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0}) Series._add_numeric_operations() Series._add_series_only_operations() Series._add_series_or_dataframe_operations() _INDEX_TYPES = ndarray, Index, list, tuple -#------------------------------------------------------------------------------ +# ----------------------------------------------------------------------------- # Supplementary functions @@ -2804,14 +2812,15 @@ def remove_na(series): def _sanitize_index(data, index, copy=False): - """ sanitize an index type to return an ndarray of the underlying, pass thru a non-Index """ + """ sanitize an index type to return an ndarray of the underlying, pass + thru a non-Index + """ if index is None: return data if len(data) != len(index): - raise ValueError('Length of values does not match length of ' - 'index') + raise ValueError('Length of values does not match length of ' 'index') if isinstance(data, PeriodIndex): data = data.asobject @@ -2822,14 +2831,17 @@ def _sanitize_index(data, index, copy=False): elif isinstance(data, np.ndarray): # coerce datetimelike types - if data.dtype.kind in ['M','m']: + if data.dtype.kind in ['M', 'm']: data = _sanitize_array(data, index, copy=copy) return data + def _sanitize_array(data, index, dtype=None, copy=False, raise_cast_failure=False): - """ sanitize input data to an ndarray, copy if 
specified, coerce to the dtype if specified """ + """ sanitize input data to an ndarray, copy if specified, coerce to the + dtype if specified + """ if dtype is not None: dtype = _coerce_to_dtype(dtype) @@ -2878,7 +2890,8 @@ def _try_cast(arr, take_fast_path): subarr = _try_cast(data, True) elif isinstance(data, Index): # don't coerce Index types - # e.g. indexes can have different conversions (so don't fast path them) + # e.g. indexes can have different conversions (so don't fast path + # them) # GH 6140 subarr = _sanitize_index(data, index, copy=True) else: @@ -2916,7 +2929,7 @@ def create_from_value(value, index, dtype): # return a new empty value suitable for the dtype if is_datetimetz(dtype): - subarr = DatetimeIndex([value]*len(index)) + subarr = DatetimeIndex([value] * len(index)) else: if not isinstance(dtype, (np.dtype, type(np.dtype))): dtype = dtype.dtype @@ -2965,9 +2978,9 @@ def create_from_value(value, index, dtype): return subarr + # backwards compatiblity class TimeSeries(Series): - def __init__(self, *args, **kwargs): # deprecation TimeSeries, #10890 warnings.warn("TimeSeries is deprecated. Please use Series", @@ -2975,12 +2988,13 @@ def __init__(self, *args, **kwargs): super(TimeSeries, self).__init__(*args, **kwargs) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Add plotting methods to Series -import pandas.tools.plotting as _gfx +import pandas.tools.plotting as _gfx # noqa -Series.plot = base.AccessorProperty(_gfx.SeriesPlotMethods, _gfx.SeriesPlotMethods) +Series.plot = base.AccessorProperty(_gfx.SeriesPlotMethods, + _gfx.SeriesPlotMethods) Series.hist = _gfx.hist_series # Add arithmetic! diff --git a/pandas/core/window.py b/pandas/core/window.py index ce8fda9e932bc..04103893a5e55 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -9,12 +9,11 @@ import warnings import numpy as np -from functools import wraps from collections import defaultdict import pandas as pd from pandas.lib import isscalar -from pandas.core.base import PandasObject, SelectionMixin, AbstractMethodError +from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com import pandas.algos as algos from pandas import compat @@ -34,17 +33,19 @@ pandas.DataFrame.%(name)s """ + class _Window(PandasObject, SelectionMixin): - _attributes = ['window','min_periods','freq','center','win_type','axis'] + _attributes = ['window', 'min_periods', 'freq', 'center', 'win_type', + 'axis'] exclusions = set() - def __init__(self, obj, window=None, min_periods=None, freq=None, center=False, - win_type=None, axis=0): + def __init__(self, obj, window=None, min_periods=None, freq=None, + center=False, win_type=None, axis=0): if freq is not None: - warnings.warn("The freq kw is deprecated and will be removed in a future version. You can resample prior " - "to passing to a window function", - FutureWarning, stacklevel=3) + warnings.warn("The freq kw is deprecated and will be removed in a " + "future version. 
You can resample prior to passing " + "to a window function", FutureWarning, stacklevel=3) self.blocks = [] self.obj = obj @@ -67,11 +68,13 @@ def _convert_freq(self, how=None): """ resample according to the how, return a new object """ obj = self._selected_obj - if self.freq is not None and isinstance(obj, (com.ABCSeries, com.ABCDataFrame)): + if (self.freq is not None and + isinstance(obj, (com.ABCSeries, com.ABCDataFrame))): if how is not None: - warnings.warn("The how kw argument is deprecated and removed in a future version. You can resample prior " - "to passing to a window function", - FutureWarning, stacklevel=6) + warnings.warn("The how kw argument is deprecated and removed " + "in a future version. You can resample prior " + "to passing to a window function", FutureWarning, + stacklevel=6) obj = obj.resample(self.freq, how=how) return obj @@ -101,7 +104,7 @@ def _gotitem(self, key, ndim, subset=None): subset = self.obj self = self._shallow_copy(subset) self._reset_cache() - if subset.ndim==2: + if subset.ndim == 2: if isscalar(key) and key in subset or com.is_list_like(key): self._selection = key return self @@ -124,8 +127,9 @@ def _get_window(self, other=None): def __unicode__(self): """ provide a nice str repr of our rolling object """ - attrs = [ "{k}={v}".format(k=k,v=getattr(self,k)) \ - for k in self._attributes if getattr(self,k,None) is not None ] + attrs = ["{k}={v}".format(k=k, v=getattr(self, k)) + for k in self._attributes + if getattr(self, k, None) is not None] return "{klass} [{attrs}]".format(klass=self.__class__.__name__, attrs=','.join(attrs)) @@ -137,13 +141,13 @@ def _shallow_copy(self, obj=None, **kwargs): obj = obj.obj for attr in self._attributes: if attr not in kwargs: - kwargs[attr] = getattr(self,attr) + kwargs[attr] = getattr(self, attr) return self._constructor(obj, **kwargs) def _prep_values(self, values=None, kill_inf=True, how=None): if values is None: - values = getattr(self._selected_obj,'values',self._selected_obj) + values = getattr(self._selected_obj, 'values', self._selected_obj) # coerce dtypes as appropriate if com.is_float_dtype(values.dtype): @@ -156,7 +160,8 @@ def _prep_values(self, values=None, kill_inf=True, how=None): try: values = values.astype(float) except (ValueError, TypeError): - raise TypeError("cannot handle this type -> {0}".format(values.dtype)) + raise TypeError("cannot handle this type -> {0}" + "".format(values.dtype)) if kill_inf: values = values.copy() @@ -174,15 +179,14 @@ def _wrap_result(self, result, block=None, obj=None): # coerce if necessary if block is not None: if com.is_timedelta64_dtype(block.values.dtype): - result = pd.to_timedelta(result.ravel(),unit='ns').values.reshape(result.shape) + result = pd.to_timedelta( + result.ravel(), unit='ns').values.reshape(result.shape) if result.ndim == 1: from pandas import Series return Series(result, obj.index, name=obj.name) - return type(obj)(result, - index=obj.index, - columns=block.columns) + return type(obj)(result, index=obj.index, columns=block.columns) return result def _wrap_results(self, results, blocks, obj): @@ -206,11 +210,11 @@ def _wrap_results(self, results, blocks, obj): if not len(final): return obj.astype('float64') - return pd.concat(final,axis=1).reindex(columns=obj.columns) + return pd.concat(final, axis=1).reindex(columns=obj.columns) def _center_window(self, result, window): """ center the result in the window """ - if self.axis > result.ndim-1: + if self.axis > result.ndim - 1: raise ValueError("Requested axis is larger then no. 
of argument " "dimensions") @@ -249,6 +253,7 @@ def aggregate(self, arg, *args, **kwargs): how : string, default None (DEPRECATED) Method for down- or re-sampling""") + class Window(_Window): """ Provides rolling transformations. @@ -264,8 +269,8 @@ class Window(_Window): Minimum number of observations in window required to have a value (otherwise result is NA). freq : string or DateOffset object, optional (default None) (DEPRECATED) - Frequency to conform the data to before computing the statistic. Specified - as a frequency string or DateOffset object. + Frequency to conform the data to before computing the statistic. + Specified as a frequency string or DateOffset object. center : boolean, default False Set the labels at the center of the window. win_type : string, default None @@ -313,8 +318,10 @@ def _prep_window(self, **kwargs): try: import scipy.signal as sig except ImportError: - raise ImportError('Please install scipy to generate window weight') - win_type = _validate_win_type(self.win_type, kwargs) # may pop from kwargs + raise ImportError('Please install scipy to generate window ' + 'weight') + # the below may pop from kwargs + win_type = _validate_win_type(self.win_type, kwargs) return sig.get_window(win_type, window).astype(float) raise ValueError('Invalid window %s' % str(window)) @@ -353,10 +360,12 @@ def _apply_window(self, mean=True, how=None, **kwargs): offset = _offset(window, center) additional_nans = np.array([np.NaN] * offset) + def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, len(window)) - return algos.roll_window(np.concatenate((arg, additional_nans)) if center else arg, - window, minp, avg=mean) + return algos.roll_window(np.concatenate((arg, additional_nans)) + if center else arg, window, minp, + avg=mean) result = np.apply_along_axis(f, self.axis, values) @@ -392,13 +401,14 @@ def sum(self, **kwargs): def mean(self, **kwargs): return self._apply_window(mean=True, **kwargs) -class _Rolling(_Window): +class _Rolling(_Window): @property def _constructor(self): return Rolling - def _apply(self, func, window=None, center=None, check_minp=None, how=None, **kwargs): + def _apply(self, func, window=None, center=None, check_minp=None, how=None, + **kwargs): """ Rolling statistical measure using supplied function. Designed to be used with passed-in Cython array-based functions. 
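`_apply_window` above (and `_apply`'s `calc` in the hunks that follow) implements `center=True` by appending `offset` trailing NaNs, running the trailing computation, and letting the result line up on window centers. A hedged sketch of that equivalence for a plain rolling mean, assuming the 0.18-era rolling semantics this patch targets:

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(7, dtype=float))
    window = 3
    offset = (window - 1) // 2   # what _offset(window, center=True) computes

    # pad with trailing NaNs, take the trailing statistic, then shift back
    padded = np.concatenate([s.values, [np.nan] * offset])
    trailing = pd.Series(padded).rolling(window, min_periods=window).mean()
    manual = trailing.iloc[offset:].reset_index(drop=True)

    centered = s.rolling(window, center=True).mean()
    print(centered.values)   # [nan  1.  2.  3.  4.  5. nan]
    print(manual.values)     # identical
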
@@ -440,9 +450,11 @@ def _apply(self, func, window=None, center=None, check_minp=None, how=None, **kw # if we have a string function name, wrap it if isinstance(func, compat.string_types): if not hasattr(algos, func): - raise ValueError("we do not support this function algos.{0}".format(func)) + raise ValueError("we do not support this function " + "algos.{0}".format(func)) cfunc = getattr(algos, func) + def func(arg, window, min_periods=None): minp = check_minp(min_periods, window) return cfunc(arg, window, minp, **kwargs) @@ -451,12 +463,14 @@ def func(arg, window, min_periods=None): if center: offset = _offset(window, center) additional_nans = np.array([np.NaN] * offset) + def calc(x): return func(np.concatenate((x, additional_nans)), window, min_periods=self.min_periods) else: + def calc(x): - return func(x,window, min_periods=self.min_periods) + return func(x, window, min_periods=self.min_periods) if values.ndim > 1: result = np.apply_along_axis(calc, self.axis, values) @@ -470,9 +484,12 @@ def calc(x): return self._wrap_results(results, blocks, obj) + class _Rolling_and_Expanding(_Rolling): - _shared_docs['count'] = """%(name)s count of number of non-NaN observations inside provided window.""" + _shared_docs['count'] = """%(name)s count of number of non-NaN + observations inside provided window.""" + def count(self): obj = self._convert_freq() window = self._get_window() @@ -481,9 +498,7 @@ def count(self): converted = np.isfinite(obj).astype(float) except TypeError: converted = np.isfinite(obj.astype(float)).astype(float) - result = self._constructor(converted, - window=window, - min_periods=0, + result = self._constructor(converted, window=window, min_periods=0, center=self.center).sum() result[result.isnull()] = 0 @@ -499,12 +514,15 @@ def count(self): *args and **kwargs are passed to the function""") def apply(self, func, args=(), kwargs={}): - _level = kwargs.pop('_level',None) + # TODO: _level is unused? 
+ _level = kwargs.pop('_level', None) # noqa window = self._get_window() offset = _offset(window, self.center) + def f(arg, window, min_periods): minp = _use_window(min_periods, window) - return algos.roll_generic(arg, window, minp, offset, func, args, kwargs) + return algos.roll_generic(arg, window, minp, offset, func, args, + kwargs) return self._apply(f, center=False) @@ -518,6 +536,7 @@ def sum(self, **kwargs): ---------- how : string, default 'max' (DEPRECATED) Method for down- or re-sampling""") + def max(self, how=None, **kwargs): if self.freq is not None and how is None: how = 'max' @@ -530,6 +549,7 @@ def max(self, how=None, **kwargs): ---------- how : string, default 'min' (DEPRECATED) Method for down- or re-sampling""") + def min(self, how=None, **kwargs): if self.freq is not None and how is None: how = 'min' @@ -545,6 +565,7 @@ def mean(self, **kwargs): ---------- how : string, default 'median' (DEPRECATED) Method for down- or re-sampling""") + def median(self, how=None, **kwargs): if self.freq is not None and how is None: how = 'median' @@ -561,6 +582,7 @@ def median(self, how=None, **kwargs): def std(self, ddof=1, **kwargs): window = self._get_window() + def f(arg, *args, **kwargs): minp = _require_min_periods(1)(self.min_periods, window) return _zsqrt(algos.roll_var(arg, window, minp, ddof)) @@ -577,21 +599,19 @@ def f(arg, *args, **kwargs): is ``N - ddof``, where ``N`` represents the number of elements.""") def var(self, ddof=1, **kwargs): - return self._apply('roll_var', - check_minp=_require_min_periods(1), - ddof=ddof, - **kwargs) + return self._apply('roll_var', check_minp=_require_min_periods(1), + ddof=ddof, **kwargs) _shared_docs['skew'] = """Unbiased %(name)s skewness""" + def skew(self, **kwargs): - return self._apply('roll_skew', - check_minp=_require_min_periods(3), + return self._apply('roll_skew', check_minp=_require_min_periods(3), **kwargs) _shared_docs['kurt'] = """Unbiased %(name)s kurtosis""" + def kurt(self, **kwargs): - return self._apply('roll_kurt', - check_minp=_require_min_periods(4), + return self._apply('roll_kurt', check_minp=_require_min_periods(4), **kwargs) _shared_docs['quantile'] = dedent(""" @@ -604,6 +624,7 @@ def kurt(self, **kwargs): def quantile(self, quantile, **kwargs): window = self._get_window() + def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) return algos.roll_quantile(arg, window, minp, quantile) @@ -618,11 +639,11 @@ def f(arg, *args, **kwargs): other : Series, DataFrame, or ndarray, optional if not supplied then will default to self and produce pairwise output pairwise : bool, default None - If False then only matching columns between self and other will be used and - the output will be a DataFrame. - If True then all pairwise combinations will be calculated and the output - will be a Panel in the case of DataFrame inputs. In the case of missing - elements, only complete pairwise observations will be used. + If False then only matching columns between self and other will be used + and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a Panel in the case of DataFrame inputs. In the case of + missing elements, only complete pairwise observations will be used. ddof : int, default 1 Delta Degrees of Freedom. 
The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements.""") @@ -630,16 +651,21 @@ def f(arg, *args, **kwargs): def cov(self, other=None, pairwise=None, ddof=1, **kwargs): if other is None: other = self._selected_obj - pairwise = True if pairwise is None else pairwise # only default unset + # only default unset + pairwise = True if pairwise is None else pairwise other = self._shallow_copy(other) window = self._get_window(other) def _get_cov(X, Y): - mean = lambda x: x.rolling(window, self.min_periods, center=self.center).mean(**kwargs) - count = (X+Y).rolling(window=window, center=self.center).count(**kwargs) + mean = lambda x: x.rolling(window, self.min_periods, + center=self.center).mean(**kwargs) + count = (X + Y).rolling(window=window, + center=self.center).count(**kwargs) bias_adj = count / (count - ddof) return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj - return _flex_binary_moment(self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise)) + + return _flex_binary_moment(self._selected_obj, other._selected_obj, + _get_cov, pairwise=bool(pairwise)) _shared_docs['corr'] = dedent(""" %(name)s sample correlation @@ -649,31 +675,31 @@ def _get_cov(X, Y): other : Series, DataFrame, or ndarray, optional if not supplied then will default to self and produce pairwise output pairwise : bool, default None - If False then only matching columns between self and other will be used and - the output will be a DataFrame. - If True then all pairwise combinations will be calculated and the output - will be a Panel in the case of DataFrame inputs. In the case of missing - elements, only complete pairwise observations will be used.""") + If False then only matching columns between self and other will be used + and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a Panel in the case of DataFrame inputs. In the case of + missing elements, only complete pairwise observations will be used.""") def corr(self, other=None, pairwise=None, **kwargs): if other is None: other = self._selected_obj - pairwise = True if pairwise is None else pairwise # only default unset + # only default unset + pairwise = True if pairwise is None else pairwise other = self._shallow_copy(other) window = self._get_window(other) def _get_corr(a, b): - a = a.rolling(window=window, - min_periods=self.min_periods, - freq=self.freq, - center=self.center) - b = b.rolling(window=window, - min_periods=self.min_periods, - freq=self.freq, - center=self.center) + a = a.rolling(window=window, min_periods=self.min_periods, + freq=self.freq, center=self.center) + b = b.rolling(window=window, min_periods=self.min_periods, + freq=self.freq, center=self.center) return a.cov(b, **kwargs) / (a.std(**kwargs) * b.std(**kwargs)) - return _flex_binary_moment(self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise)) + + return _flex_binary_moment(self._selected_obj, other._selected_obj, + _get_corr, pairwise=bool(pairwise)) + class Rolling(_Rolling_and_Expanding): """ @@ -690,8 +716,8 @@ class Rolling(_Rolling_and_Expanding): Minimum number of observations in window required to have a value (otherwise result is NA). freq : string or DateOffset object, optional (default None) (DEPRECATED) - Frequency to conform the data to before computing the statistic. Specified - as a frequency string or DateOffset object. + Frequency to conform the data to before computing the statistic. 
+ Specified as a frequency string or DateOffset object. center : boolean, default False Set the labels at the center of the window. axis : int, default 0 @@ -794,13 +820,16 @@ def quantile(self, quantile, **kwargs): @Appender(_doc_template) @Appender(_shared_docs['cov']) def cov(self, other=None, pairwise=None, ddof=1, **kwargs): - return super(Rolling, self).cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) + return super(Rolling, self).cov(other=other, pairwise=pairwise, + ddof=ddof, **kwargs) @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['corr']) def corr(self, other=None, pairwise=None, **kwargs): - return super(Rolling, self).corr(other=other, pairwise=pairwise, **kwargs) + return super(Rolling, self).corr(other=other, pairwise=pairwise, + **kwargs) + class Expanding(_Rolling_and_Expanding): """ @@ -814,8 +843,8 @@ class Expanding(_Rolling_and_Expanding): Minimum number of observations in window required to have a value (otherwise result is NA). freq : string or DateOffset object, optional (default None) (DEPRECATED) - Frequency to conform the data to before computing the statistic. Specified - as a frequency string or DateOffset object. + Frequency to conform the data to before computing the statistic. + Specified as a frequency string or DateOffset object. center : boolean, default False Set the labels at the center of the window. axis : int, default 0 @@ -834,10 +863,14 @@ class Expanding(_Rolling_and_Expanding): of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ - _attributes = ['min_periods','freq','center','axis'] + _attributes = ['min_periods', 'freq', 'center', 'axis'] - def __init__(self, obj, min_periods=1, freq=None, center=False, axis=0, **kwargs): - return super(Expanding, self).__init__(obj=obj, min_periods=min_periods, freq=freq, center=center, axis=axis) + def __init__(self, obj, min_periods=1, freq=None, center=False, axis=0, + **kwargs): + return super(Expanding, self).__init__(obj=obj, + min_periods=min_periods, + freq=freq, center=center, + axis=axis) @property def _constructor(self): @@ -846,8 +879,10 @@ def _constructor(self): def _get_window(self, other=None): obj = self._selected_obj if other is None: - return max(len(obj), self.min_periods) if self.min_periods else len(obj) - return max((len(obj) + len(obj)), self.min_periods) if self.min_periods else (len(obj) + len(obj)) + return (max(len(obj), self.min_periods) if self.min_periods + else len(obj)) + return (max((len(obj) + len(obj)), self.min_periods) + if self.min_periods else (len(obj) + len(obj))) @Substitution(name='expanding') @Appender(SelectionMixin._see_also_template) @@ -933,13 +968,16 @@ def quantile(self, quantile, **kwargs): @Appender(_doc_template) @Appender(_shared_docs['cov']) def cov(self, other=None, pairwise=None, ddof=1, **kwargs): - return super(Expanding, self).cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) + return super(Expanding, self).cov(other=other, pairwise=pairwise, + ddof=ddof, **kwargs) @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['corr']) def corr(self, other=None, pairwise=None, **kwargs): - return super(Expanding, self).corr(other=other, pairwise=pairwise, **kwargs) + return super(Expanding, self).corr(other=other, pairwise=pairwise, + **kwargs) + _bias_template = """ @@ -979,15 +1017,16 @@ class EWM(_Rolling): span : float, optional Specify decay in terms of span, :math:`\alpha = 2 / (span + 1)` halflife : float, optional - Specify decay in terms of halflife, :math:`\alpha = 1 
- exp(log(0.5) / halflife)` + Specify decay in terms of halflife, + :math:`\alpha = 1 - exp(log(0.5) / halflife)` min_periods : int, default 0 Minimum number of observations in window required to have a value (otherwise result is NA). freq : None or string alias / date offset object, default=None (DEPRECATED) Frequency to conform to before computing statistic adjust : boolean, default True - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings (viewing EWMA as a moving average) + Divide by decaying adjustment factor in beginning periods to account + for imbalance in relative weightings (viewing EWMA as a moving average) ignore_na : boolean, default False Ignore missing values when calculating weights; specify True to reproduce pre-0.15.0 behavior @@ -1004,8 +1043,8 @@ class EWM(_Rolling): decay parameter :math:`\alpha` is related to the span as :math:`\alpha = 2 / (s + 1) = 1 / (1 + c)` - where `c` is the center of mass. Given a span, the associated center of mass is - :math:`c = (s - 1) / 2` + where `c` is the center of mass. Given a span, the associated center of + mass is :math:`c = (s - 1) / 2` So a "20-day EWMA" would have center 9.5. @@ -1013,8 +1052,8 @@ class EWM(_Rolling): frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - When adjust is True (default), weighted averages are calculated using weights - (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1. + When adjust is True (default), weighted averages are calculated using + weights (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1. When adjust is False, weighted averages are calculated recursively as: weighted_average[0] = arg[0]; @@ -1025,18 +1064,18 @@ class EWM(_Rolling): average of [x, None, y] are (1-alpha)**2 and 1 (if adjust is True), and (1-alpha)**2 and alpha (if adjust is False). - When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based on - relative positions. For example, the weights of x and y used in calculating - the final weighted average of [x, None, y] are 1-alpha and 1 (if adjust is - True), and 1-alpha and alpha (if adjust is False). + When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based + on relative positions. For example, the weights of x and y used in + calculating the final weighted average of [x, None, y] are 1-alpha and 1 + (if adjust is True), and 1-alpha and alpha (if adjust is False). 
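A worked check of the two weighting schemes just described, using this docstring's relation `alpha = 1 / (1 + com)` (so `com=1` gives `alpha = 0.5`); the `.ewm` accessor calls assume a pandas build that ships the EWM class below:

    import numpy as np
    import pandas as pd

    x = pd.Series([1.0, 2.0, 3.0])
    alpha = 0.5                          # com=1  ->  alpha = 1 / (1 + com)

    # adjust=True: explicit weights (1-a)**2, (1-a), 1 on [x0, x1, x2]
    w = (1 - alpha) ** np.arange(len(x) - 1, -1, -1)
    adj = (w * x.values).sum() / w.sum()              # 4.25 / 1.75 ~= 2.4286

    # adjust=False: recursive form y[t] = (1-a)*y[t-1] + a*x[t]
    y = x.values[0]
    for v in x.values[1:]:
        y = (1 - alpha) * y + alpha * v               # -> 2.25

    print(adj, x.ewm(com=1, adjust=True).mean().iloc[-1])
    print(y, x.ewm(com=1, adjust=False).mean().iloc[-1])
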
More details can be found at http://pandas.pydata.org/pandas-docs/stable/computation.html#exponentially-weighted-moment-functions """ - _attributes = ['com','min_periods','freq','adjust','ignore_na','axis'] + _attributes = ['com', 'min_periods', 'freq', 'adjust', 'ignore_na', 'axis'] - def __init__(self, obj, com=None, span=None, halflife=None, min_periods=0, freq=None, - adjust=True, ignore_na=False, axis=0): + def __init__(self, obj, com=None, span=None, halflife=None, min_periods=0, + freq=None, adjust=True, ignore_na=False, axis=0): self.obj = obj self.com = _get_center_of_mass(com, span, halflife) self.min_periods = min_periods @@ -1088,11 +1127,14 @@ def _apply(self, func, how=None, **kwargs): # if we have a string function name, wrap it if isinstance(func, compat.string_types): if not hasattr(algos, func): - raise ValueError("we do not support this function algos.{0}".format(func)) + raise ValueError("we do not support this function " + "algos.{0}".format(func)) cfunc = getattr(algos, func) + def func(arg): - return cfunc(arg, self.com, int(self.adjust), int(self.ignore_na), int(self.min_periods)) + return cfunc(arg, self.com, int(self.adjust), + int(self.ignore_na), int(self.min_periods)) results.append(np.apply_along_axis(func, self.axis, values)) @@ -1110,20 +1152,18 @@ def mean(self, **kwargs): def std(self, bias=False, **kwargs): """exponential weighted moving stddev""" return _zsqrt(self.var(bias=bias, **kwargs)) - vol=std + + vol = std @Substitution(name='ewm') @Appender(_doc_template) @Appender(_bias_template) def var(self, bias=False, **kwargs): """exponential weighted moving variance""" + def f(arg): - return algos.ewmcov(arg, - arg, - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), + return algos.ewmcov(arg, arg, self.com, int(self.adjust), + int(self.ignore_na), int(self.min_periods), int(bias)) return self._apply(f, **kwargs) @@ -1135,22 +1175,20 @@ def cov(self, other=None, pairwise=None, bias=False, **kwargs): """exponential weighted sample covariance""" if other is None: other = self._selected_obj - pairwise = True if pairwise is None else pairwise # only default unset + # only default unset + pairwise = True if pairwise is None else pairwise other = self._shallow_copy(other) def _get_cov(X, Y): X = self._shallow_copy(X) Y = self._shallow_copy(Y) - cov = algos.ewmcov(X._prep_values(), - Y._prep_values(), - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - int(bias)) + cov = algos.ewmcov(X._prep_values(), Y._prep_values(), self.com, + int(self.adjust), int(self.ignore_na), + int(self.min_periods), int(bias)) return X._wrap_result(cov) - return _flex_binary_moment(self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise)) + return _flex_binary_moment(self._selected_obj, other._selected_obj, + _get_cov, pairwise=bool(pairwise)) @Substitution(name='ewm') @Appender(_doc_template) @@ -1159,14 +1197,18 @@ def corr(self, other=None, pairwise=None, **kwargs): """exponential weighted sample correlation""" if other is None: other = self._selected_obj - pairwise = True if pairwise is None else pairwise # only default unset + # only default unset + pairwise = True if pairwise is None else pairwise other = self._shallow_copy(other) def _get_corr(X, Y): X = self._shallow_copy(X) Y = self._shallow_copy(Y) + def _cov(x, y): - return algos.ewmcov(x, y, self.com, int(self.adjust), int(self.ignore_na), int(self.min_periods), 1) + return algos.ewmcov(x, y, self.com, int(self.adjust), + int(self.ignore_na), 
int(self.min_periods),
+                                    1)

             x_values = X._prep_values()
             y_values = Y._prep_values()
@@ -1176,25 +1218,26 @@ def _cov(x, y):
             corr = cov / _zsqrt(x_var * y_var)
             return X._wrap_result(corr)

-        return _flex_binary_moment(self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise))
+        return _flex_binary_moment(self._selected_obj, other._selected_obj,
+                                   _get_corr, pairwise=bool(pairwise))
+
+# Helper Funcs

-########################
-##### Helper Funcs #####
-########################

 def _flex_binary_moment(arg1, arg2, f, pairwise=False):
     from pandas import Series, DataFrame, Panel
-    if not (isinstance(arg1,(np.ndarray, Series, DataFrame)) and
-            isinstance(arg2,(np.ndarray, Series, DataFrame))):
+    if not (isinstance(arg1, (np.ndarray, Series, DataFrame)) and
+            isinstance(arg2, (np.ndarray, Series, DataFrame))):
         raise TypeError("arguments to moment function must be of type "
-                "np.ndarray/Series/DataFrame")
+                        "np.ndarray/Series/DataFrame")

-    if isinstance(arg1, (np.ndarray, Series)) and \
-            isinstance(arg2, (np.ndarray,Series)):
+    if (isinstance(arg1, (np.ndarray, Series)) and
+            isinstance(arg2, (np.ndarray, Series))):
         X, Y = _prep_binary(arg1, arg2)
         return f(X, Y)

     elif isinstance(arg1, DataFrame):
+
         def dataframe_from_int_dict(data, frame_template):
             result = DataFrame(data, index=frame_template.index)
             if len(result.columns) > 0:
@@ -1221,16 +1264,18 @@ def dataframe_from_int_dict(data, frame_template):
                     for col in res_columns:
                         if col in X and col in Y:
                             results[col] = f(X[col], Y[col])
-                    return DataFrame(results, index=X.index, columns=res_columns)
+                    return DataFrame(results, index=X.index,
+                                     columns=res_columns)

             elif pairwise is True:
                 results = defaultdict(dict)
                 for i, k1 in enumerate(arg1.columns):
                     for j, k2 in enumerate(arg2.columns):
-                        if j<i and arg2 is arg1:
+                        if j < i and arg2 is arg1:
                             # Symmetric case
                             results[i][j] = results[j][i]
                         else:
-                            results[i][j] = f(*_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]))
+                            results[i][j] = f(*_prep_binary(arg1.iloc[:, i],
+                                                            arg2.iloc[:, j]))
                 p = Panel.from_dict(results).swapaxes('items', 'major')
                 if len(p.major_axis) > 0:
                     p.major_axis = arg1.columns[p.major_axis]
@@ -1248,6 +1293,7 @@ def dataframe_from_int_dict(data, frame_template):
     else:
         return _flex_binary_moment(arg2, arg1, f)

+
 def _get_center_of_mass(com, span, halflife):
     valid_count = len([x for x in [com, span, halflife] if x is not None])
     if valid_count > 1:
@@ -1265,6 +1311,7 @@ def _get_center_of_mass(com, span, halflife):

     return float(com)

+
 def _offset(window, center):
     if not com.is_integer(window):
         window = len(window)
@@ -1274,20 +1321,24 @@ def _offset(window, center):
     try:
         return int(offset)
     except:
         return offset.astype(int)

+
 def _require_min_periods(p):
     def _check_func(minp, window):
         if minp is None:
             return window
         else:
             return max(p, minp)
+
     return _check_func

+
 def _use_window(minp, window):
     if minp is None:
         return window
     else:
         return minp

+
 def _zsqrt(x):
     result = np.sqrt(x)
     mask = x < 0
@@ -1302,6 +1353,7 @@ def _zsqrt(x):

     return result

+
 def _prep_binary(arg1, arg2):
     if not isinstance(arg2, type(arg1)):
         raise Exception('Input arrays must be of the same type!')
@@ -1312,6 +1364,7 @@ def _prep_binary(arg1, arg2):

     return X, Y

+
 def _validate_win_type(win_type, kwargs):
     # may pop from kwargs
     arg_map = {'kaiser': ['beta'],
                'gaussian': ['std'],
                'general_gaussian': ['power', 'width'],
                'slepian': ['width']}
     if win_type in arg_map:
-        return tuple([win_type] +
-                     _pop_args(win_type, arg_map[win_type], kwargs))
+        return tuple([win_type] + _pop_args(win_type, arg_map[win_type],
+                                            kwargs))
     return win_type

@@ -1333,9 +1386,9 @@ def _pop_args(win_type, arg_names, kwargs):
         all_args.append(kwargs.pop(n))
     return all_args

-
-#############################
-##### top-level exports #####
-#############################
+
+# Top-level exports
+

 def rolling(obj, win_type=None, **kwds):
     from pandas import Series, DataFrame
@@ -1346,20 +1399,28 @@ def rolling(obj, win_type=None, **kwds):
     if not isinstance(obj, (Series, DataFrame)):
         raise TypeError('invalid type: %s' % type(obj))

     if win_type is not None:
         return Window(obj, win_type=win_type, **kwds)
     return Rolling(obj, **kwds)

+
+
 rolling.__doc__ = Window.__doc__

+
 def expanding(obj, **kwds):
     from pandas import Series, DataFrame
     if not isinstance(obj, (Series, DataFrame)):
         raise TypeError('invalid type: %s' % type(obj))
     return Expanding(obj, **kwds)

+
+
 expanding.__doc__ = Expanding.__doc__

+
 def ewm(obj, **kwds):
     from pandas import Series, DataFrame
     if not isinstance(obj, (Series, DataFrame)):
         raise TypeError('invalid type: %s' % type(obj))
     return EWM(obj, **kwds)

+
+
 ewm.__doc__ = EWM.__doc__
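Finally, a hedged usage sketch of the module-level constructors exported above; importing `pandas.core.window` directly assumes the 0.18-era module layout this patch edits, and the public spellings remain the `.rolling()` / `.expanding()` / `.ewm()` accessors:

    import numpy as np
    import pandas as pd
    import pandas.core.window as rwindow   # internal module patched above

    s = pd.Series(np.arange(10, dtype=float))

    r = rwindow.rolling(s, window=3)          # no win_type -> Rolling
    e = rwindow.expanding(s, min_periods=1)   # -> Expanding
    m = rwindow.ewm(s, com=2)                 # -> EWM

    # each dispatches to the same objects the Series accessors construct
    assert r.mean().equals(s.rolling(3).mean())
    assert e.sum().equals(s.expanding(min_periods=1).sum())
    assert m.mean().equals(s.ewm(com=2).mean())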