diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 7166801b3fbf0..c2992f2e6c7d2 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -89,7 +89,8 @@ API changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - +- Improved csv write performance with mixed dtypes, including datetimes by up to 5x (:issue:`9940`) +- Improved csv write performance generally by 2x (:issue:`9940`) diff --git a/pandas/core/format.py b/pandas/core/format.py index 06e1fab27cd6d..6e632e6ea741b 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -14,15 +14,14 @@ from pandas.core.config import get_option, set_option import pandas.core.common as com import pandas.lib as lib -from pandas.tslib import iNaT, Timestamp, Timedelta - +from pandas.tslib import iNaT, Timestamp, Timedelta, format_array_from_datetime +from pandas.tseries.index import DatetimeIndex +from pandas.tseries.period import PeriodIndex import numpy as np import itertools import csv -from pandas.tseries.period import PeriodIndex, DatetimeIndex - docstring_to_string = """ Parameters ---------- @@ -1259,9 +1258,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, if isinstance(cols, Index): cols = cols.to_native_types(na_rep=na_rep, float_format=float_format, - date_format=date_format) + date_format=date_format, + quoting=self.quoting) else: - cols = list(cols) + cols = np.asarray(list(cols)) self.obj = self.obj.loc[:, cols] # update columns to include possible multiplicity of dupes @@ -1270,9 +1270,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, if isinstance(cols, Index): cols = cols.to_native_types(na_rep=na_rep, float_format=float_format, - date_format=date_format) + date_format=date_format, + quoting=self.quoting) else: - cols = list(cols) + cols = np.asarray(list(cols)) # save it self.cols = cols @@ -1371,8 +1372,10 @@ def strftime_with_nulls(x): values = self.obj.copy() values.index = data_index values.columns = values.columns.to_native_types( - na_rep=na_rep, float_format=float_format, - date_format=date_format) + na_rep=na_rep, + float_format=float_format, + date_format=date_format, + quoting=self.quoting) values = values[cols] series = {} @@ -1543,18 +1546,22 @@ def _save_chunk(self, start_i, end_i): slicer = slice(start_i, end_i) for i in range(len(self.blocks)): b = self.blocks[i] - d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, + d = b.to_native_types(slicer=slicer, + na_rep=self.na_rep, float_format=self.float_format, decimal=self.decimal, - date_format=self.date_format) + date_format=self.date_format, + quoting=self.quoting) for col_loc, col in zip(b.mgr_locs, d): # self.data is a preallocated list self.data[col_loc] = col - ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, + ix = data_index.to_native_types(slicer=slicer, + na_rep=self.na_rep, float_format=self.float_format, - date_format=self.date_format) + date_format=self.date_format, + quoting=self.quoting) lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) @@ -2030,16 +2037,43 @@ def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs): self.date_format = date_format def _format_strings(self): - formatter = (self.formatter or - _get_format_datetime64_from_values(self.values, - nat_rep=self.nat_rep, - date_format=self.date_format)) - fmt_values = [formatter(x) for x in self.values] + # we may have a tz, if so, then need to process element-by-element + # when DatetimeBlockWithTimezones is a reality this could be fixed + values = self.values + if not isinstance(values, DatetimeIndex): + values = DatetimeIndex(values) + + if values.tz is None: + fmt_values = format_array_from_datetime(values.asi8.ravel(), + format=_get_format_datetime64_from_values(values, self.date_format), + na_rep=self.nat_rep).reshape(values.shape) + fmt_values = fmt_values.tolist() + + else: + + values = values.asobject + is_dates_only = _is_dates_only(values) + formatter = (self.formatter or _get_format_datetime64(is_dates_only, values, date_format=self.date_format)) + fmt_values = [ formatter(x) for x in self.values ] return fmt_values +def _is_dates_only(values): + # return a boolean if we are only dates (and don't have a timezone) + values = DatetimeIndex(values) + if values.tz is not None: + return False + + values_int = values.asi8 + consider_values = values_int != iNaT + one_day_nanos = (86400 * 1e9) + even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 + if even_days: + return True + return False + def _format_datetime64(x, tz=None, nat_rep='NaT'): if x is None or lib.checknull(x): return nat_rep @@ -2062,22 +2096,6 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None): else: return x._date_repr - -def _is_dates_only(values): - # return a boolean if we are only dates (and don't have a timezone) - from pandas import DatetimeIndex - values = DatetimeIndex(values) - if values.tz is not None: - return False - - values_int = values.asi8 - consider_values = values_int != iNaT - one_day_nanos = (86400 * 1e9) - even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 - if even_days: - return True - return False - def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None): if is_dates_only: @@ -2088,13 +2106,12 @@ def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None): return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) -def _get_format_datetime64_from_values(values, - nat_rep='NaT', - date_format=None): +def _get_format_datetime64_from_values(values, date_format): + """ given values and a date_format, return a string format """ is_dates_only = _is_dates_only(values) - return _get_format_datetime64(is_dates_only=is_dates_only, - nat_rep=nat_rep, - date_format=date_format) + if is_dates_only: + return date_format or "%Y-%m-%d" + return None class Timedelta64Formatter(GenericArrayFormatter): diff --git a/pandas/core/index.py b/pandas/core/index.py index 0a3adbd19ae92..8b509c6876ec7 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1071,12 +1071,16 @@ def to_native_types(self, slicer=None, **kwargs): values = values[slicer] return values._format_native_types(**kwargs) - def _format_native_types(self, na_rep='', **kwargs): + def _format_native_types(self, na_rep='', quoting=None, **kwargs): """ actually format my specific types """ mask = isnull(self) - values = np.array(self, dtype=object, copy=True) + if not self.is_object() and not quoting: + values = np.asarray(self).astype(str) + else: + values = np.array(self, dtype=object, copy=True) + values[mask] = na_rep - return values.tolist() + return values def equals(self, other): """ @@ -3298,7 +3302,7 @@ def _reference_duplicate_name(self, name): return np.sum(name == np.asarray(self.names)) > 1 def _format_native_types(self, **kwargs): - return self.tolist() + return self.values @property def _constructor(self): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 7cc7bc390bcbb..864dc0dd46de2 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -484,16 +484,21 @@ def _try_coerce_and_cast_result(self, result, dtype=None): def _try_fill(self, value): return value - def to_native_types(self, slicer=None, na_rep='', **kwargs): + def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values if slicer is not None: values = values[:, slicer] - values = np.array(values, dtype=object) mask = isnull(values) + + if not self.is_object and not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype='object') + values[mask] = na_rep - return values.tolist() + return values # block actions #### def copy(self, deep=True): @@ -1221,32 +1226,34 @@ def _try_cast(self, element): return element def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.', - **kwargs): + quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values if slicer is not None: values = values[:, slicer] - values = np.array(values, dtype=object) mask = isnull(values) - values[mask] = na_rep - + formatter = None if float_format and decimal != '.': formatter = lambda v : (float_format % v).replace('.',decimal,1) elif decimal != '.': formatter = lambda v : ('%g' % v).replace('.',decimal,1) elif float_format: formatter = lambda v : float_format % v + + if formatter is None and not quoting: + values = values.astype(str) else: - formatter = None + values = np.array(values, dtype='object') + values[mask] = na_rep if formatter: imask = (~mask).ravel() values.flat[imask] = np.array( [formatter(val) for val in values.ravel()[imask]]) - return values.tolist() + return values def should_store(self, value): # when inserting a column should not coerce integers to floats @@ -1366,7 +1373,7 @@ def _try_coerce_result(self, result): def should_store(self, value): return issubclass(value.dtype.type, np.timedelta64) - def to_native_types(self, slicer=None, na_rep=None, **kwargs): + def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values @@ -1387,7 +1394,7 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs): rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all') for val in values.ravel()[imask]], dtype=object) - return rvalues.tolist() + return rvalues def get_values(self, dtype=None): @@ -1763,18 +1770,19 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, ndim=self.ndim, placement=self.mgr_locs) - def to_native_types(self, slicer=None, na_rep='', **kwargs): + def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values if slicer is not None: # Categorical is always one dimension values = values[slicer] - values = np.array(values, dtype=object) mask = isnull(values) + values = np.array(values, dtype='object') values[mask] = na_rep - # Blocks.to_native_type returns list of lists, but we are always only a list - return [values.tolist()] + + # we are expected to return a 2-d ndarray + return values.reshape(1,len(values)) class DatetimeBlock(Block): __slots__ = () @@ -1864,29 +1872,21 @@ def fillna(self, value, limit=None, fastpath=True, placement=self.mgr_locs)] def to_native_types(self, slicer=None, na_rep=None, date_format=None, - **kwargs): + quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values if slicer is not None: values = values[:, slicer] - mask = isnull(values) - - rvalues = np.empty(values.shape, dtype=object) - if na_rep is None: - na_rep = 'NaT' - rvalues[mask] = na_rep - imask = (~mask).ravel() - if date_format is None: - date_formatter = lambda x: Timestamp(x)._repr_base - else: - date_formatter = lambda x: Timestamp(x).strftime(date_format) + from pandas.core.format import _get_format_datetime64_from_values + format = _get_format_datetime64_from_values(values, date_format) - rvalues.flat[imask] = np.array([date_formatter(val) for val in - values.ravel()[imask]], dtype=object) - - return rvalues.tolist() + result = tslib.format_array_from_datetime(values.view('i8').ravel(), + tz=None, + format=format, + na_rep=na_rep).reshape(values.shape) + return result def should_store(self, value): return issubclass(value.dtype.type, np.datetime64) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 5ab2ee4327177..0d53b19425c2f 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -933,7 +933,7 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re @cython.boundscheck(False) @cython.wraparound(False) -def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer): +def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer): cdef int N, j, i, ncols cdef list rows diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index e3455d2449b55..b557594e8e7ef 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -3010,12 +3010,12 @@ def test_format(self): def test_output_significant_digits(self): # Issue #9764 - + # In case default display precision changes: with pd.option_context('display.precision', 7): # DataFrame example from issue #9764 d=pd.DataFrame({'col1':[9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, 4.999e-6, 5e-6, 5.0001e-6, 6e-6]}) - + expected_output={ (0,6):' col1\n0 9.999000e-08\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07', (1,6):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07', diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 2b37c64940170..9da71423b3daa 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -61,7 +61,7 @@ def groupby(self, f): return _algos.groupby_object(objs, f) def _format_with_header(self, header, **kwargs): - return header + self._format_native_types(**kwargs) + return header + list(self._format_native_types(**kwargs)) def __contains__(self, key): try: diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 7dac36a9ae5cc..7b0ff578b0d90 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -673,12 +673,13 @@ def _add_delta(self, delta): def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs): - data = self.asobject - from pandas.core.format import Datetime64Formatter - return Datetime64Formatter(values=data, - nat_rep=na_rep, - date_format=date_format, - justify='all').get_result() + from pandas.core.format import _get_format_datetime64_from_values + format = _get_format_datetime64_from_values(self, date_format) + + return tslib.format_array_from_datetime(self.asi8, + tz=self.tz, + format=format, + na_rep=na_rep) def to_datetime(self, dayfirst=False): return self.copy() diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index b1f0ba1f127fa..a4b754f5a6bbd 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -387,7 +387,7 @@ def to_datetime(self, dayfirst=False): qyear = _field_accessor('qyear', 1) days_in_month = _field_accessor('days_in_month', 11, "The number of days in the month") daysinmonth = days_in_month - + def _get_object_array(self): freq = self.freq return np.array([ Period._from_ordinal(ordinal=x, freq=freq) for x in self.values], copy=False) @@ -687,7 +687,7 @@ def _format_native_types(self, na_rep=u('NaT'), **kwargs): imask = ~mask values[imask] = np.array([u('%s') % dt for dt in values[imask]]) - return values.tolist() + return values def __array_finalize__(self, obj): if not self.ndim: # pragma: no cover diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 7580fa5489e15..40dbbd7584c7a 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1398,6 +1398,69 @@ def parse_datetime_string(date_string, **kwargs): dt = parse_date(date_string, **kwargs) return dt +def format_array_from_datetime(ndarray[int64_t] values, object tz=None, object format=None, object na_rep=None): + """ + return a np object array of the string formatted values + + Parameters + ---------- + values : a 1-d i8 array + tz : the timezone (or None) + format : optional, default is None + a strftime capable string + na_rep : optional, default is None + a nat format + + """ + cdef: + int64_t val, ns, N = len(values) + ndarray[object] result = np.empty(N, dtype=object) + object ts, res + pandas_datetimestruct dts + + if na_rep is None: + na_rep = 'NaT' + + for i in range(N): + val = values[i] + + if val == iNaT: + result[i] = na_rep + else: + if format is None and tz is None: + + pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) + res = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, + dts.month, + dts.day, + dts.hour, + dts.min, + dts.sec) + + ns = dts.ps / 1000 + + if ns != 0: + res += '.%.9d' % (ns + 1000 * dts.us) + elif dts.us != 0: + res += '.%.6d' % dts.us + + result[i] = res + + else: + ts = Timestamp(val, tz=tz) + if format is None: + result[i] = str(ts) + else: + + # invalid format string + # requires dates > 1900 + try: + result[i] = ts.strftime(format) + except ValueError: + result[i] = str(ts) + + return result + def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, format=None, utc=None, coerce=False, unit=None): cdef: