Skip to content

PERF: improve perf of writing csv's with datetimes #9940

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 20, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ API changes
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~


- Improved csv write performance with mixed dtypes, including datetimes, by up to 5x (:issue:`9940`)
- Improved csv write performance generally by 2x (:issue:`9940`)



Expand Down
99 changes: 58 additions & 41 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,14 @@
from pandas.core.config import get_option, set_option
import pandas.core.common as com
import pandas.lib as lib
from pandas.tslib import iNaT, Timestamp, Timedelta

from pandas.tslib import iNaT, Timestamp, Timedelta, format_array_from_datetime
from pandas.tseries.index import DatetimeIndex
from pandas.tseries.period import PeriodIndex
import numpy as np

import itertools
import csv

from pandas.tseries.period import PeriodIndex, DatetimeIndex

docstring_to_string = """
Parameters
----------
Expand Down Expand Up @@ -1259,9 +1258,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
if isinstance(cols, Index):
cols = cols.to_native_types(na_rep=na_rep,
float_format=float_format,
date_format=date_format)
date_format=date_format,
quoting=self.quoting)
else:
cols = list(cols)
cols = np.asarray(list(cols))
self.obj = self.obj.loc[:, cols]

# update columns to include possible multiplicity of dupes
Expand All @@ -1270,9 +1270,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
if isinstance(cols, Index):
cols = cols.to_native_types(na_rep=na_rep,
float_format=float_format,
date_format=date_format)
date_format=date_format,
quoting=self.quoting)
else:
cols = list(cols)
cols = np.asarray(list(cols))

# save it
self.cols = cols
Expand Down Expand Up @@ -1371,8 +1372,10 @@ def strftime_with_nulls(x):
values = self.obj.copy()
values.index = data_index
values.columns = values.columns.to_native_types(
na_rep=na_rep, float_format=float_format,
date_format=date_format)
na_rep=na_rep,
float_format=float_format,
date_format=date_format,
quoting=self.quoting)
values = values[cols]

series = {}
Expand Down Expand Up @@ -1543,18 +1546,22 @@ def _save_chunk(self, start_i, end_i):
slicer = slice(start_i, end_i)
for i in range(len(self.blocks)):
b = self.blocks[i]
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
d = b.to_native_types(slicer=slicer,
na_rep=self.na_rep,
float_format=self.float_format,
decimal=self.decimal,
date_format=self.date_format)
date_format=self.date_format,
quoting=self.quoting)

for col_loc, col in zip(b.mgr_locs, d):
# self.data is a preallocated list
self.data[col_loc] = col

ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
ix = data_index.to_native_types(slicer=slicer,
na_rep=self.na_rep,
float_format=self.float_format,
date_format=self.date_format)
date_format=self.date_format,
quoting=self.quoting)

lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)

Expand Down Expand Up @@ -2030,16 +2037,43 @@ def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs):
self.date_format = date_format

def _format_strings(self):
formatter = (self.formatter or
_get_format_datetime64_from_values(self.values,
nat_rep=self.nat_rep,
date_format=self.date_format))

fmt_values = [formatter(x) for x in self.values]
# we may have a tz, if so, then need to process element-by-element
# when DatetimeBlockWithTimezones is a reality this could be fixed
values = self.values
if not isinstance(values, DatetimeIndex):
values = DatetimeIndex(values)

if values.tz is None:
fmt_values = format_array_from_datetime(values.asi8.ravel(),
format=_get_format_datetime64_from_values(values, self.date_format),
na_rep=self.nat_rep).reshape(values.shape)
fmt_values = fmt_values.tolist()

else:

values = values.asobject
is_dates_only = _is_dates_only(values)
formatter = (self.formatter or _get_format_datetime64(is_dates_only, values, date_format=self.date_format))
fmt_values = [ formatter(x) for x in self.values ]

return fmt_values


def _is_dates_only(values):
# return a boolean if we are only dates (and don't have a timezone)
values = DatetimeIndex(values)
if values.tz is not None:
return False

values_int = values.asi8
consider_values = values_int != iNaT
one_day_nanos = (86400 * 1e9)
even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0
if even_days:
return True
return False

def _format_datetime64(x, tz=None, nat_rep='NaT'):
if x is None or lib.checknull(x):
return nat_rep
Expand All @@ -2062,22 +2096,6 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None):
else:
return x._date_repr


def _is_dates_only(values):
# return a boolean if we are only dates (and don't have a timezone)
from pandas import DatetimeIndex
values = DatetimeIndex(values)
if values.tz is not None:
return False

values_int = values.asi8
consider_values = values_int != iNaT
one_day_nanos = (86400 * 1e9)
even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0
if even_days:
return True
return False

def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None):

if is_dates_only:
Expand All @@ -2088,13 +2106,12 @@ def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None):
return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep)


def _get_format_datetime64_from_values(values,
nat_rep='NaT',
date_format=None):
def _get_format_datetime64_from_values(values, date_format):
""" given values and a date_format, return a string format """
is_dates_only = _is_dates_only(values)
return _get_format_datetime64(is_dates_only=is_dates_only,
nat_rep=nat_rep,
date_format=date_format)
if is_dates_only:
return date_format or "%Y-%m-%d"
return None


class Timedelta64Formatter(GenericArrayFormatter):
Expand Down
12 changes: 8 additions & 4 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1071,12 +1071,16 @@ def to_native_types(self, slicer=None, **kwargs):
values = values[slicer]
return values._format_native_types(**kwargs)

def _format_native_types(self, na_rep='', **kwargs):
def _format_native_types(self, na_rep='', quoting=None, **kwargs):
""" actually format my specific types """
mask = isnull(self)
values = np.array(self, dtype=object, copy=True)
if not self.is_object() and not quoting:
values = np.asarray(self).astype(str)
else:
values = np.array(self, dtype=object, copy=True)

values[mask] = na_rep
return values.tolist()
return values

def equals(self, other):
"""
Expand Down Expand Up @@ -3298,7 +3302,7 @@ def _reference_duplicate_name(self, name):
return np.sum(name == np.asarray(self.names)) > 1

def _format_native_types(self, **kwargs):
return self.tolist()
return self.values

@property
def _constructor(self):
Expand Down
62 changes: 31 additions & 31 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,16 +484,21 @@ def _try_coerce_and_cast_result(self, result, dtype=None):
def _try_fill(self, value):
return value

def to_native_types(self, slicer=None, na_rep='', **kwargs):
def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """

values = self.values
if slicer is not None:
values = values[:, slicer]
values = np.array(values, dtype=object)
mask = isnull(values)

if not self.is_object and not quoting:
values = values.astype(str)
else:
values = np.array(values, dtype='object')

values[mask] = na_rep
return values.tolist()
return values

# block actions ####
def copy(self, deep=True):
Expand Down Expand Up @@ -1221,32 +1226,34 @@ def _try_cast(self, element):
return element

def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.',
**kwargs):
quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """

values = self.values
if slicer is not None:
values = values[:, slicer]
values = np.array(values, dtype=object)
mask = isnull(values)
values[mask] = na_rep


formatter = None
if float_format and decimal != '.':
formatter = lambda v : (float_format % v).replace('.',decimal,1)
elif decimal != '.':
formatter = lambda v : ('%g' % v).replace('.',decimal,1)
elif float_format:
formatter = lambda v : float_format % v

if formatter is None and not quoting:
values = values.astype(str)
else:
formatter = None
values = np.array(values, dtype='object')

values[mask] = na_rep
if formatter:
imask = (~mask).ravel()
values.flat[imask] = np.array(
[formatter(val) for val in values.ravel()[imask]])

return values.tolist()
return values

def should_store(self, value):
# when inserting a column should not coerce integers to floats
Expand Down Expand Up @@ -1366,7 +1373,7 @@ def _try_coerce_result(self, result):
def should_store(self, value):
return issubclass(value.dtype.type, np.timedelta64)

def to_native_types(self, slicer=None, na_rep=None, **kwargs):
def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """

values = self.values
Expand All @@ -1387,7 +1394,7 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs):
rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all')
for val in values.ravel()[imask]],
dtype=object)
return rvalues.tolist()
return rvalues


def get_values(self, dtype=None):
Expand Down Expand Up @@ -1763,18 +1770,19 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
ndim=self.ndim,
placement=self.mgr_locs)

def to_native_types(self, slicer=None, na_rep='', **kwargs):
def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """

values = self.values
if slicer is not None:
# Categorical is always one dimension
values = values[slicer]
values = np.array(values, dtype=object)
mask = isnull(values)
values = np.array(values, dtype='object')
values[mask] = na_rep
# Blocks.to_native_type returns list of lists, but we are always only a list
return [values.tolist()]

# we are expected to return a 2-d ndarray
return values.reshape(1,len(values))

class DatetimeBlock(Block):
__slots__ = ()
Expand Down Expand Up @@ -1864,29 +1872,21 @@ def fillna(self, value, limit=None,
fastpath=True, placement=self.mgr_locs)]

def to_native_types(self, slicer=None, na_rep=None, date_format=None,
**kwargs):
quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """

values = self.values
if slicer is not None:
values = values[:, slicer]
mask = isnull(values)

rvalues = np.empty(values.shape, dtype=object)
if na_rep is None:
na_rep = 'NaT'
rvalues[mask] = na_rep
imask = (~mask).ravel()

if date_format is None:
date_formatter = lambda x: Timestamp(x)._repr_base
else:
date_formatter = lambda x: Timestamp(x).strftime(date_format)
from pandas.core.format import _get_format_datetime64_from_values
format = _get_format_datetime64_from_values(values, date_format)

rvalues.flat[imask] = np.array([date_formatter(val) for val in
values.ravel()[imask]], dtype=object)

return rvalues.tolist()
result = tslib.format_array_from_datetime(values.view('i8').ravel(),
tz=None,
format=format,
na_rep=na_rep).reshape(values.shape)
return result

def should_store(self, value):
return issubclass(value.dtype.type, np.datetime64)
Expand Down
2 changes: 1 addition & 1 deletion pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -933,7 +933,7 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re

@cython.boundscheck(False)
@cython.wraparound(False)
def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer):
def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer):

cdef int N, j, i, ncols
cdef list rows
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -3010,12 +3010,12 @@ def test_format(self):

def test_output_significant_digits(self):
# Issue #9764

# In case default display precision changes:
with pd.option_context('display.precision', 7):
# DataFrame example from issue #9764
d=pd.DataFrame({'col1':[9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, 4.999e-6, 5e-6, 5.0001e-6, 6e-6]})

expected_output={
(0,6):' col1\n0 9.999000e-08\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07',
(1,6):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07',
Expand Down
2 changes: 1 addition & 1 deletion pandas/tseries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def groupby(self, f):
return _algos.groupby_object(objs, f)

def _format_with_header(self, header, **kwargs):
return header + self._format_native_types(**kwargs)
return header + list(self._format_native_types(**kwargs))

def __contains__(self, key):
try:
Expand Down
Loading