Skip to content

Commit 68ba602

Browse files
committed
ENH: ujson native datetime serialisation
1 parent 359017f commit 68ba602

File tree

11 files changed

+514
-306
lines changed

11 files changed

+514
-306
lines changed

doc/source/io.rst

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1107,8 +1107,11 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
11071107
- ``keep_default_dates`` : boolean, default True. If parsing dates, then parse the default datelike columns
11081108
- ``numpy`` : direct decoding to numpy arrays. default is False;
11091109
Note that the JSON ordering **MUST** be the same for each term if ``numpy=True``
1110-
- ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function
1111-
when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality
1110+
- ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality
1111+
- ``date_unit`` : string, the timestamp unit to detect if converting dates. Default
1112+
None. By default the timestamp precision will be detected, if this is not desired
1113+
then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to
1114+
seconds, milliseconds, microseconds or nanoseconds respectively.
11121115

11131116
The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is
11141117
not parsable.
@@ -1168,6 +1171,25 @@ I like my string indices
11681171
sij.index
11691172
sij.columns
11701173
1174+
My dates have been written in nanoseconds, so they need to be read back in
1175+
nanoseconds
1176+
1177+
.. ipython:: python
1178+
1179+
json = dfj2.to_json(date_unit='ns')
1180+
1181+
# Try to parse timestamps as milliseconds -> Won't Work
1182+
dfju = pd.read_json(json, date_unit='ms')
1183+
dfju
1184+
1185+
# Let Pandas detect the correct precision
1186+
dfju = pd.read_json(json)
1187+
dfju
1188+
1189+
# Or specify that all timestamps are in nanoseconds
1190+
dfju = pd.read_json(json, date_unit='ns')
1191+
dfju
1192+
11711193
.. ipython:: python
11721194
:suppress:
11731195

doc/source/release.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,9 @@ pandas 0.13
134134
local variable was undefined (:issue:`4381`)
135135
- In ``to_json``, raise if a passed ``orient`` would cause loss of data because
136136
of a duplicate index (:issue:`4359`)
137+
- In ``to_json``, fix date handling so milliseconds are the default timestamp
138+
as the docstring says (:issue:`4362`).
139+
- JSON NaT handling fixed, NaTs are now serialised to `null` (:issue:`4498`)
137140
- Fixed passing ``keep_default_na=False`` when ``na_values=None`` (:issue:`4318`)
138141
- Fixed bug with ``values`` raising an error on a DataFrame with duplicate columns and mixed
139142
dtypes, surfaced in (:issue:`4377`)

pandas/core/generic.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -535,7 +535,7 @@ def to_clipboard(self):
535535
clipboard.to_clipboard(self)
536536

537537
def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
538-
double_precision=10, force_ascii=True):
538+
double_precision=10, force_ascii=True, date_unit='ms'):
539539
"""
540540
Convert the object to a JSON string.
541541
@@ -566,11 +566,15 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
566566
- columns : dict like {column -> {index -> value}}
567567
- values : just the values array
568568
569-
date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601)
570-
default is epoch
569+
date_format : string, default 'epoch'
570+
type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601
571571
double_precision : The number of decimal places to use when encoding
572572
floating point values, default 10.
573573
force_ascii : force encoded string to be ASCII, default True.
574+
date_unit : string, default 'ms' (milliseconds)
575+
The time unit to encode to, governs timestamp and ISO8601
576+
precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
577+
microsecond, and nanosecond respectively.
574578
575579
Returns
576580
-------
@@ -580,8 +584,13 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
580584
"""
581585

582586
from pandas.io import json
583-
return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient, date_format=date_format,
584-
double_precision=double_precision, force_ascii=force_ascii)
587+
return json.to_json(
588+
path_or_buf=path_or_buf,
589+
obj=self, orient=orient,
590+
date_format=date_format,
591+
double_precision=double_precision,
592+
force_ascii=force_ascii,
593+
date_unit=date_unit)
585594

586595
# install the indexers
587596
for _name, _indexer in indexing.get_indexers_list():

pandas/io/json.py

Lines changed: 84 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11

22
# pylint: disable-msg=E1101,W0613,W0603
3-
from pandas.compat import StringIO, long
4-
from pandas import compat
5-
import os
3+
from pandas.compat import long
64

5+
from pandas import compat, isnull
76
from pandas import Series, DataFrame, to_datetime
87
from pandas.io.common import get_filepath_or_buffer
98
import pandas.json as _json
@@ -12,32 +11,39 @@
1211

1312
import numpy as np
1413
from pandas.tslib import iNaT
15-
import pandas.lib as lib
1614

1715
### interface to/from ###
1816

19-
def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True):
17+
18+
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
19+
double_precision=10, force_ascii=True, date_unit='ms'):
2020

2121
if isinstance(obj, Series):
22-
s = SeriesWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision,
23-
ensure_ascii=force_ascii).write()
22+
s = SeriesWriter(
23+
obj, orient=orient, date_format=date_format,
24+
double_precision=double_precision, ensure_ascii=force_ascii,
25+
date_unit=date_unit).write()
2426
elif isinstance(obj, DataFrame):
25-
s = FrameWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision,
26-
ensure_ascii=force_ascii).write()
27+
s = FrameWriter(
28+
obj, orient=orient, date_format=date_format,
29+
double_precision=double_precision, ensure_ascii=force_ascii,
30+
date_unit=date_unit).write()
2731
else:
2832
raise NotImplementedError
2933

3034
if isinstance(path_or_buf, compat.string_types):
31-
with open(path_or_buf,'w') as fh:
35+
with open(path_or_buf, 'w') as fh:
3236
fh.write(s)
3337
elif path_or_buf is None:
3438
return s
3539
else:
3640
path_or_buf.write(s)
3741

42+
3843
class Writer(object):
3944

40-
def __init__(self, obj, orient, date_format, double_precision, ensure_ascii):
45+
def __init__(self, obj, orient, date_format, double_precision,
46+
ensure_ascii, date_unit):
4147
self.obj = obj
4248

4349
if orient is None:
@@ -47,38 +53,23 @@ def __init__(self, obj, orient, date_format, double_precision, ensure_ascii):
4753
self.date_format = date_format
4854
self.double_precision = double_precision
4955
self.ensure_ascii = ensure_ascii
56+
self.date_unit = date_unit
5057

5158
self.is_copy = False
5259
self._format_axes()
53-
self._format_dates()
54-
55-
def _needs_to_date(self, obj):
56-
return obj.dtype == 'datetime64[ns]'
57-
58-
def _format_dates(self):
59-
raise NotImplementedError
6060

6161
def _format_axes(self):
6262
raise NotImplementedError
6363

64-
def _format_to_date(self, data):
65-
66-
# iso
67-
if self.date_format == 'iso':
68-
return data.apply(lambda x: x.isoformat())
69-
70-
# int64
71-
else:
72-
return data.astype(np.int64)
73-
74-
def copy_if_needed(self):
75-
""" copy myself if necessary """
76-
if not self.is_copy:
77-
self.obj = self.obj.copy()
78-
self.is_copy = True
79-
8064
def write(self):
81-
return dumps(self.obj, orient=self.orient, double_precision=self.double_precision, ensure_ascii=self.ensure_ascii)
65+
return dumps(
66+
self.obj,
67+
orient=self.orient,
68+
double_precision=self.double_precision,
69+
ensure_ascii=self.ensure_ascii,
70+
date_unit=self.date_unit,
71+
iso_dates=self.date_format == 'iso')
72+
8273

8374
class SeriesWriter(Writer):
8475
_default_orient = 'index'
@@ -87,17 +78,7 @@ def _format_axes(self):
8778
if not self.obj.index.is_unique and self.orient == 'index':
8879
raise ValueError("Series index must be unique for orient="
8980
"'%s'" % self.orient)
90-
if self._needs_to_date(self.obj.index):
91-
self.copy_if_needed()
92-
self.obj.index = self._format_to_date(self.obj.index.to_series())
9381

94-
def _format_dates(self):
95-
if self.obj.dtype == 'datetime64[ns]':
96-
self.obj = self._format_to_date(self.obj)
97-
98-
def _format_bools(self):
99-
if self._needs_to_bool(self.obj):
100-
self.obj = self._format_to_bool(self.obj)
10182

10283
class FrameWriter(Writer):
10384
_default_orient = 'columns'
@@ -113,39 +94,10 @@ def _format_axes(self):
11394
raise ValueError("DataFrame columns must be unique for orient="
11495
"'%s'." % self.orient)
11596

116-
if self.orient == 'columns':
117-
axis = 'index'
118-
elif self.orient == 'index':
119-
axis = 'columns'
120-
else:
121-
return
122-
123-
a = getattr(self.obj,axis)
124-
if self._needs_to_date(a):
125-
self.copy_if_needed()
126-
setattr(self.obj,axis,self._format_to_date(a.to_series()))
127-
128-
def _format_dates(self):
129-
dtypes = self.obj.dtypes
130-
if len(dtypes[dtypes == 'datetime64[ns]']):
131-
132-
# need to create a new object
133-
d = {}
134-
135-
for i, (col, c) in enumerate(self.obj.iteritems()):
136-
137-
if c.dtype == 'datetime64[ns]':
138-
c = self._format_to_date(c)
139-
140-
d[i] = c
141-
142-
d = DataFrame(d,index=self.obj.index)
143-
d.columns = self.obj.columns
144-
self.obj = d
14597

14698
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
14799
convert_axes=True, convert_dates=True, keep_default_dates=True,
148-
numpy=False, precise_float=False):
100+
numpy=False, precise_float=False, date_unit=None):
149101
"""
150102
Convert JSON string to pandas object
151103
@@ -176,18 +128,28 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
176128
values : just the values array
177129
178130
typ : type of object to recover (series or frame), default 'frame'
179-
dtype : if True, infer dtypes, if a dict of column to dtype, then use those,
180-
if False, then don't infer dtypes at all, default is True,
181-
apply only to the data
182-
convert_axes : boolean, try to convert the axes to the proper dtypes, default is True
183-
convert_dates : a list of columns to parse for dates; If True, then try to parse datelike columns
184-
default is True
185-
keep_default_dates : boolean, default True. If parsing dates,
186-
then parse the default datelike columns
187-
numpy : direct decoding to numpy arrays. default is False.Note that the JSON ordering MUST be the same
188-
for each term if numpy=True.
189-
precise_float : boolean, default False. Set to enable usage of higher precision (strtod) function
190-
when decoding string to double values. Default (False) is to use fast but less precise builtin functionality
131+
dtype : boolean or dict, default True
132+
If True, infer dtypes, if a dict of column to dtype, then use those,
133+
if False, then don't infer dtypes at all, applies only to the data.
134+
convert_axes : boolean, default True
135+
Try to convert the axes to the proper dtypes.
136+
convert_dates : boolean, default True
137+
List of columns to parse for dates; If True, then try to parse
138+
datelike columns default is True
139+
keep_default_dates : boolean, default True.
140+
If parsing dates, then parse the default datelike columns
141+
numpy : boolean, default False
142+
Direct decoding to numpy arrays. Note that the JSON ordering MUST be
143+
the same for each term if numpy=True.
144+
precise_float : boolean, default False.
145+
Set to enable usage of higher precision (strtod) function when
146+
decoding string to double values. Default (False) is to use fast but
147+
less precise builtin functionality
148+
date_unit : string, default None
149+
The timestamp unit to detect if converting dates. The default behaviour
150+
is to try and detect the correct precision, but if this is not desired
151+
then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
152+
milliseconds, microseconds or nanoseconds respectively.
191153
192154
Returns
193155
-------
@@ -208,20 +170,28 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
208170

209171
obj = None
210172
if typ == 'frame':
211-
obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy).parse()
173+
obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit).parse()
212174

213175
if typ == 'series' or obj is None:
214176
if not isinstance(dtype,bool):
215177
dtype = dict(data = dtype)
216-
obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy).parse()
178+
obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit).parse()
217179

218180
return obj
219181

182+
220183
class Parser(object):
221184

185+
_STAMP_UNITS = ('s', 'ms', 'us', 'ns')
186+
_MIN_STAMPS = {
187+
's': long(31536000),
188+
'ms': long(31536000000),
189+
'us': long(31536000000000),
190+
'ns': long(31536000000000000)}
191+
222192
def __init__(self, json, orient, dtype=True, convert_axes=True,
223193
convert_dates=True, keep_default_dates=False, numpy=False,
224-
precise_float=False):
194+
precise_float=False, date_unit=None):
225195
self.json = json
226196

227197
if orient is None:
@@ -233,10 +203,20 @@ def __init__(self, json, orient, dtype=True, convert_axes=True,
233203
if orient == "split":
234204
numpy = False
235205

206+
if date_unit is not None:
207+
date_unit = date_unit.lower()
208+
if date_unit not in self._STAMP_UNITS:
209+
raise ValueError('date_unit must be one of %s' %
210+
(self._STAMP_UNITS,))
211+
self.min_stamp = self._MIN_STAMPS[date_unit]
212+
else:
213+
self.min_stamp = self._MIN_STAMPS['s']
214+
236215
self.numpy = numpy
237216
self.precise_float = precise_float
238-
self.convert_axes = convert_axes
217+
self.convert_axes = convert_axes
239218
self.convert_dates = convert_dates
219+
self.date_unit = date_unit
240220
self.keep_default_dates = keep_default_dates
241221
self.obj = None
242222

@@ -356,21 +336,23 @@ def _try_convert_to_date(self, data):
356336

357337

358338
# ignore numbers that are out of range
359-
if issubclass(new_data.dtype.type,np.number):
360-
if not ((new_data == iNaT) | (new_data > long(31536000000000000))).all():
339+
if issubclass(new_data.dtype.type, np.number):
340+
in_range = (isnull(new_data.values) | (new_data > self.min_stamp) |
341+
(new_data.values == iNaT))
342+
if not in_range.all():
361343
return data, False
362344

363-
try:
364-
new_data = to_datetime(new_data)
365-
except:
345+
date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
346+
for date_unit in date_units:
366347
try:
367-
new_data = to_datetime(new_data.astype('int64'))
348+
new_data = to_datetime(new_data, errors='raise',
349+
unit=date_unit)
350+
except OverflowError:
351+
continue
368352
except:
369-
370-
# return old, noting more we can do
371-
return data, False
372-
373-
return new_data, True
353+
break
354+
return new_data, True
355+
return data, False
374356

375357
def _try_convert_dates(self):
376358
raise NotImplementedError
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"A":{"2000-01-03T00:00:00":1.56808523,"2000-01-04T00:00:00":-0.2550111,"2000-01-05T00:00:00":1.51493992,"2000-01-06T00:00:00":-0.02765498,"2000-01-07T00:00:00":0.05951614},"B":{"2000-01-03T00:00:00":0.65727391,"2000-01-04T00:00:00":-0.08072427,"2000-01-05T00:00:00":0.11805825,"2000-01-06T00:00:00":0.44679743,"2000-01-07T00:00:00":-2.69652057},"C":{"2000-01-03T00:00:00":1.81021139,"2000-01-04T00:00:00":-0.03202878,"2000-01-05T00:00:00":1.629455,"2000-01-06T00:00:00":0.33192641,"2000-01-07T00:00:00":1.28163262},"D":{"2000-01-03T00:00:00":-0.17251653,"2000-01-04T00:00:00":-0.17581665,"2000-01-05T00:00:00":-1.31506612,"2000-01-06T00:00:00":-0.27885413,"2000-01-07T00:00:00":0.34703478},"date":{"2000-01-03T00:00:00":"1992-01-06T18:21:32.120000","2000-01-04T00:00:00":"1992-01-06T18:21:32.120000","2000-01-05T00:00:00":"1992-01-06T18:21:32.120000","2000-01-06T00:00:00":"2013-01-01T00:00:00","2000-01-07T00:00:00":"1992-01-06T18:21:32.120000"}}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"A":{"946857600000000000":1.56808523,"946944000000000000":-0.2550111,"947030400000000000":1.51493992,"947116800000000000":-0.02765498,"947203200000000000":0.05951614},"B":{"946857600000000000":0.65727391,"946944000000000000":-0.08072427,"947030400000000000":0.11805825,"947116800000000000":0.44679743,"947203200000000000":-2.69652057},"C":{"946857600000000000":1.81021139,"946944000000000000":-0.03202878,"947030400000000000":1.629455,"947116800000000000":0.33192641,"947203200000000000":1.28163262},"D":{"946857600000000000":-0.17251653,"946944000000000000":-0.17581665,"947030400000000000":-1.31506612,"947116800000000000":-0.27885413,"947203200000000000":0.34703478},"date":{"946857600000000000":694722092120000000,"946944000000000000":694722092120000000,"947030400000000000":694722092120000000,"947116800000000000":1356998400000000000,"947203200000000000":694722092120000000},"modified":{"946857600000000000":694722092120000000,"946944000000000000":null,"947030400000000000":694722092120000000,"947116800000000000":1356998400000000000,"947203200000000000":694722092120000000}}

0 commit comments

Comments
 (0)