Skip to content

Commit 85059a4

Browse files
committed
Merge pull request #9940 from jreback/csv_dt
PERF: improve perf of writing csv's with datetimes
2 parents db19f2d + 3d54482 commit 85059a4

File tree

10 files changed

+175
-89
lines changed

10 files changed

+175
-89
lines changed

doc/source/whatsnew/v0.16.1.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@ API changes
8989
Performance Improvements
9090
~~~~~~~~~~~~~~~~~~~~~~~~
9191

92-
92+
- Improved csv write performance with mixed dtypes, including datetimes, by up to 5x (:issue:`9940`)
93+
- Improved csv write performance generally by 2x (:issue:`9940`)
9394

9495

9596

pandas/core/format.py

Lines changed: 58 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,14 @@
1414
from pandas.core.config import get_option, set_option
1515
import pandas.core.common as com
1616
import pandas.lib as lib
17-
from pandas.tslib import iNaT, Timestamp, Timedelta
18-
17+
from pandas.tslib import iNaT, Timestamp, Timedelta, format_array_from_datetime
18+
from pandas.tseries.index import DatetimeIndex
19+
from pandas.tseries.period import PeriodIndex
1920
import numpy as np
2021

2122
import itertools
2223
import csv
2324

24-
from pandas.tseries.period import PeriodIndex, DatetimeIndex
25-
2625
docstring_to_string = """
2726
Parameters
2827
----------
@@ -1259,9 +1258,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
12591258
if isinstance(cols, Index):
12601259
cols = cols.to_native_types(na_rep=na_rep,
12611260
float_format=float_format,
1262-
date_format=date_format)
1261+
date_format=date_format,
1262+
quoting=self.quoting)
12631263
else:
1264-
cols = list(cols)
1264+
cols = np.asarray(list(cols))
12651265
self.obj = self.obj.loc[:, cols]
12661266

12671267
# update columns to include possible multiplicity of dupes
@@ -1270,9 +1270,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
12701270
if isinstance(cols, Index):
12711271
cols = cols.to_native_types(na_rep=na_rep,
12721272
float_format=float_format,
1273-
date_format=date_format)
1273+
date_format=date_format,
1274+
quoting=self.quoting)
12741275
else:
1275-
cols = list(cols)
1276+
cols = np.asarray(list(cols))
12761277

12771278
# save it
12781279
self.cols = cols
@@ -1371,8 +1372,10 @@ def strftime_with_nulls(x):
13711372
values = self.obj.copy()
13721373
values.index = data_index
13731374
values.columns = values.columns.to_native_types(
1374-
na_rep=na_rep, float_format=float_format,
1375-
date_format=date_format)
1375+
na_rep=na_rep,
1376+
float_format=float_format,
1377+
date_format=date_format,
1378+
quoting=self.quoting)
13761379
values = values[cols]
13771380

13781381
series = {}
@@ -1543,18 +1546,22 @@ def _save_chunk(self, start_i, end_i):
15431546
slicer = slice(start_i, end_i)
15441547
for i in range(len(self.blocks)):
15451548
b = self.blocks[i]
1546-
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
1549+
d = b.to_native_types(slicer=slicer,
1550+
na_rep=self.na_rep,
15471551
float_format=self.float_format,
15481552
decimal=self.decimal,
1549-
date_format=self.date_format)
1553+
date_format=self.date_format,
1554+
quoting=self.quoting)
15501555

15511556
for col_loc, col in zip(b.mgr_locs, d):
15521557
# self.data is a preallocated list
15531558
self.data[col_loc] = col
15541559

1555-
ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
1560+
ix = data_index.to_native_types(slicer=slicer,
1561+
na_rep=self.na_rep,
15561562
float_format=self.float_format,
1557-
date_format=self.date_format)
1563+
date_format=self.date_format,
1564+
quoting=self.quoting)
15581565

15591566
lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
15601567

@@ -2030,16 +2037,43 @@ def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs):
20302037
self.date_format = date_format
20312038

20322039
def _format_strings(self):
    """
    Format ``self.values`` as a list of strings.

    A user-supplied ``self.formatter`` takes precedence and is applied
    element-by-element. Otherwise tz-naive values are formatted in one
    vectorized call and tz-aware values are formatted one at a time.
    """
    values = self.values
    if not isinstance(values, DatetimeIndex):
        values = DatetimeIndex(values)

    # an explicit formatter must win; the vectorized path below has no
    # way to honour it, so handle it before choosing a branch
    if self.formatter is not None:
        return [self.formatter(x) for x in values]

    if values.tz is None:
        date_format = _get_format_datetime64_from_values(values,
                                                         self.date_format)
        fmt_values = format_array_from_datetime(
            values.asi8.ravel(),
            format=date_format,
            na_rep=self.nat_rep).reshape(values.shape)
        return fmt_values.tolist()

    # we may have a tz, if so, then need to process element-by-element
    # when DatetimeBlockWithTimezones is a reality this could be fixed
    values = values.asobject
    is_dates_only = _is_dates_only(values)
    # nat_rep is passed by keyword: the second positional parameter of
    # _get_format_datetime64 is nat_rep, not the values
    formatter = _get_format_datetime64(is_dates_only,
                                       nat_rep=self.nat_rep,
                                       date_format=self.date_format)
    return [formatter(x) for x in values]
20412061

20422062

2063+
def _is_dates_only(values):
2064+
# return a boolean if we are only dates (and don't have a timezone)
2065+
values = DatetimeIndex(values)
2066+
if values.tz is not None:
2067+
return False
2068+
2069+
values_int = values.asi8
2070+
consider_values = values_int != iNaT
2071+
one_day_nanos = (86400 * 1e9)
2072+
even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0
2073+
if even_days:
2074+
return True
2075+
return False
2076+
20432077
def _format_datetime64(x, tz=None, nat_rep='NaT'):
20442078
if x is None or lib.checknull(x):
20452079
return nat_rep
@@ -2062,22 +2096,6 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None):
20622096
else:
20632097
return x._date_repr
20642098

2065-
2066-
def _is_dates_only(values):
2067-
# return a boolean if we are only dates (and don't have a timezone)
2068-
from pandas import DatetimeIndex
2069-
values = DatetimeIndex(values)
2070-
if values.tz is not None:
2071-
return False
2072-
2073-
values_int = values.asi8
2074-
consider_values = values_int != iNaT
2075-
one_day_nanos = (86400 * 1e9)
2076-
even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0
2077-
if even_days:
2078-
return True
2079-
return False
2080-
20812099
def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None):
20822100

20832101
if is_dates_only:
@@ -2088,13 +2106,12 @@ def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None):
20882106
return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep)
20892107

20902108

2091-
def _get_format_datetime64_from_values(values, date_format):
    """
    given values and a date_format, return a string format

    Returns the explicit ``date_format`` whenever one is supplied. When
    none is supplied, returns "%Y-%m-%d" if the values are dates only,
    and None otherwise so the caller falls back to its own default.
    """
    if date_format:
        # an explicit format applies whether or not the values carry a
        # time component
        return date_format

    if _is_dates_only(values):
        return "%Y-%m-%d"
    return None
20982115

20992116

21002117
class Timedelta64Formatter(GenericArrayFormatter):

pandas/core/index.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1071,12 +1071,16 @@ def to_native_types(self, slicer=None, **kwargs):
10711071
values = values[slicer]
10721072
return values._format_native_types(**kwargs)
10731073

1074-
def _format_native_types(self, na_rep='', quoting=None, **kwargs):
    """
    actually format my specific types

    Returns an ndarray of the formatted values with nulls replaced by
    ``na_rep``. Non-object data is stringified in bulk when ``quoting``
    is falsy; otherwise the values are kept as objects.
    """
    mask = isnull(self)
    if not self.is_object() and not quoting:
        values = np.asarray(self).astype(str)
        if mask.any():
            # a fixed-width string array silently truncates anything
            # longer than its item width, so widen before filling na_rep
            values = values.astype(object)
    else:
        values = np.array(self, dtype=object, copy=True)

    values[mask] = na_rep
    return values
10801084

10811085
def equals(self, other):
10821086
"""
@@ -3298,7 +3302,7 @@ def _reference_duplicate_name(self, name):
32983302
return np.sum(name == np.asarray(self.names)) > 1
32993303

33003304
def _format_native_types(self, **kwargs):
3301-
return self.tolist()
3305+
return self.values
33023306

33033307
@property
33043308
def _constructor(self):

pandas/core/internals.py

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -484,16 +484,21 @@ def _try_coerce_and_cast_result(self, result, dtype=None):
484484
def _try_fill(self, value):
485485
return value
486486

487-
def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
    """
    convert to our native types format, slicing if desired

    Returns a 2-d ndarray with nulls replaced by ``na_rep``. Non-object
    data is stringified in bulk when ``quoting`` is falsy; otherwise the
    values are kept as objects.
    """
    values = self.values
    if slicer is not None:
        values = values[:, slicer]
    mask = isnull(values)

    if not self.is_object and not quoting:
        values = values.astype(str)
        if mask.any():
            # a fixed-width string array silently truncates anything
            # longer than its item width, so widen before filling na_rep
            values = values.astype(object)
    else:
        values = np.array(values, dtype='object')

    values[mask] = na_rep
    return values
497502

498503
# block actions ####
499504
def copy(self, deep=True):
@@ -1221,32 +1226,34 @@ def _try_cast(self, element):
12211226
return element
12221227

12231228
def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.',
                    quoting=None, **kwargs):
    """
    convert to our native types format, slicing if desired

    Returns a 2-d ndarray with nulls replaced by ``na_rep``. When neither
    ``float_format`` nor a non-default ``decimal`` is given and ``quoting``
    is falsy, the floats are stringified in bulk; otherwise they are kept
    as objects and formatted element-by-element.
    """
    values = self.values
    if slicer is not None:
        values = values[:, slicer]
    mask = isnull(values)

    formatter = None
    if float_format and decimal != '.':
        formatter = lambda v: (float_format % v).replace('.', decimal, 1)
    elif decimal != '.':
        formatter = lambda v: ('%g' % v).replace('.', decimal, 1)
    elif float_format:
        formatter = lambda v: float_format % v

    if formatter is None and not quoting:
        values = values.astype(str)
        if mask.any():
            # a fixed-width string array silently truncates anything
            # longer than its item width, so widen before filling na_rep
            values = values.astype(object)
    else:
        values = np.array(values, dtype='object')

    values[mask] = na_rep
    if formatter:
        # only the non-null positions still hold floats to be formatted
        imask = (~mask).ravel()
        values.flat[imask] = np.array(
            [formatter(val) for val in values.ravel()[imask]])

    return values
12501257

12511258
def should_store(self, value):
12521259
# when inserting a column should not coerce integers to floats
@@ -1366,7 +1373,7 @@ def _try_coerce_result(self, result):
13661373
def should_store(self, value):
13671374
return issubclass(value.dtype.type, np.timedelta64)
13681375

1369-
def to_native_types(self, slicer=None, na_rep=None, **kwargs):
1376+
def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs):
13701377
""" convert to our native types format, slicing if desired """
13711378

13721379
values = self.values
@@ -1387,7 +1394,7 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs):
13871394
rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all')
13881395
for val in values.ravel()[imask]],
13891396
dtype=object)
1390-
return rvalues.tolist()
1397+
return rvalues
13911398

13921399

13931400
def get_values(self, dtype=None):
@@ -1763,18 +1770,19 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
17631770
ndim=self.ndim,
17641771
placement=self.mgr_locs)
17651772

1766-
def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
    """
    convert to our native types format, slicing if desired

    Returns a 2-d object ndarray of shape (1, n) with nulls replaced by
    ``na_rep``.
    """
    # Categorical is always one dimension, so the slicer applies directly
    data = self.values if slicer is None else self.values[slicer]

    null_positions = isnull(data)
    native = np.array(data, dtype='object')
    native[null_positions] = na_rep

    # we are expected to return a 2-d ndarray
    return native.reshape(1, len(native))
17781786

17791787
class DatetimeBlock(Block):
17801788
__slots__ = ()
@@ -1864,29 +1872,21 @@ def fillna(self, value, limit=None,
18641872
fastpath=True, placement=self.mgr_locs)]
18651873

18661874
def to_native_types(self, slicer=None, na_rep=None, date_format=None,
                    quoting=None, **kwargs):
    """
    convert to our native types format, slicing if desired

    Returns an ndarray of formatted datetime strings with the same shape
    as the (sliced) block values. ``na_rep`` defaults to 'NaT'.
    """
    values = self.values
    if slicer is not None:
        values = values[:, slicer]

    if na_rep is None:
        # make the default explicit rather than relying on the callee
        na_rep = 'NaT'

    # imported here rather than at module level
    # NOTE(review): presumably to avoid a circular import - confirm
    from pandas.core.format import _get_format_datetime64_from_values
    fmt = _get_format_datetime64_from_values(values, date_format)

    # the formatter works on a flat int64 view; restore the shape afterwards
    result = tslib.format_array_from_datetime(values.view('i8').ravel(),
                                              tz=None,
                                              format=fmt,
                                              na_rep=na_rep).reshape(values.shape)
    return result
18901890

18911891
def should_store(self, value):
18921892
return issubclass(value.dtype.type, np.datetime64)

pandas/lib.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -933,7 +933,7 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re
933933

934934
@cython.boundscheck(False)
935935
@cython.wraparound(False)
936-
def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer):
936+
def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer):
937937

938938
cdef int N, j, i, ncols
939939
cdef list rows

pandas/tests/test_format.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3010,12 +3010,12 @@ def test_format(self):
30103010

30113011
def test_output_significant_digits(self):
30123012
# Issue #9764
3013-
3013+
30143014
# In case default display precision changes:
30153015
with pd.option_context('display.precision', 7):
30163016
# DataFrame example from issue #9764
30173017
d=pd.DataFrame({'col1':[9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, 4.999e-6, 5e-6, 5.0001e-6, 6e-6]})
3018-
3018+
30193019
expected_output={
30203020
(0,6):' col1\n0 9.999000e-08\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07',
30213021
(1,6):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07',

pandas/tseries/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def groupby(self, f):
6161
return _algos.groupby_object(objs, f)
6262

6363
def _format_with_header(self, header, **kwargs):
64-
return header + self._format_native_types(**kwargs)
64+
return header + list(self._format_native_types(**kwargs))
6565

6666
def __contains__(self, key):
6767
try:

0 commit comments

Comments
 (0)