Description
- I have checked that this issue has not already been reported (related, but different).
- I have confirmed this bug exists on the latest version of pandas.
- (optional) I have confirmed this bug exists on the master branch of pandas.
Code Sample, a copy-pastable example
import numpy as np
import pandas as pd
# Taken from the example here: https://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#sparsedtype
dtype = pd.SparseDtype(np.dtype('datetime64[ns]'))
# Some value
values = [np.datetime64('2012-05-01T01:00:00.000000'), np.datetime64('2016-05-01T01:00:00.000000')]
# Create the series
series = pd.Series(values, dtype=dtype)
Alternatively:
series = pd.Series(values, dtype="Sparse[datetime64[ns]]")
Problem description
As a user, I would expect datetime64[ns] to be supported as a SparseDtype for SparseArray, based on the Sparse data structures page in the documentation. This is desirable for the same reason as supporting other sparse types: date(time) data can also consist mostly of NaT values that users want to store efficiently.
Running the code above yields the following TypeError:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-52-9fc6a02721e3> in <module>
----> 1 series = pd.Series(values, dtype=dtype)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
325 data = data.copy()
326 else:
--> 327 data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True)
328
329 data = SingleBlockManager.from_array(data, index)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
439 elif isinstance(data, (list, tuple)) and len(data) > 0:
440 if dtype is not None:
--> 441 subarr = _try_cast(data, dtype, copy, raise_cast_failure)
442 else:
443 subarr = maybe_convert_platform(data)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\construction.py in _try_cast(arr, dtype, copy, raise_cast_failure)
551 subarr = arr
552 else:
--> 553 subarr = maybe_cast_to_datetime(arr, dtype)
554
555 # Take care in creating object arrays (but iterators are not
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in maybe_cast_to_datetime(value, dtype, errors)
1327 # pandas supports dtype whose granularity is less than [ns]
1328 # e.g., [ps], [fs], [as]
-> 1329 if dtype <= np.dtype("M8[ns]"):
1330 if dtype.name == "datetime64":
1331 raise ValueError(msg)
TypeError: data type not understood
Expected Output
The expected output for Sparse[datetime64[ns]] is similar to what one would expect from other dtypes:
>>> series.values
[2012-05-01 01:00:00, 2016-05-01 01:00:00]
Fill: NaT
IntIndex
Indices: array([0, 1])
>>> series.dtype
Sparse[datetime64[ns], NaT]
Workaround
Note that the following workaround partially gives the expected results:
series = pd.Series(pd.arrays.SparseArray(values))
With this workaround, the following operations work as expected:
>>> series.values
[2012-05-01 01:00:00, 2016-05-01 01:00:00]
Fill: NaT
IntIndex
Indices: array([0, 1])
>>> series.dtype
Sparse[datetime64[ns], NaT]
However, series.head() will throw an error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
C:\ProgramData\Anaconda3\lib\site-packages\IPython\lib\pretty.py in pretty(self, obj)
400 if cls is not object \
401 and callable(cls.__dict__.get('__repr__')):
--> 402 return _repr_pprint(obj, self, cycle)
403
404 return _default_pprint(obj, self, cycle)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\lib\pretty.py in _repr_pprint(obj, p, cycle)
695 """A pprint that just redirects to the normal repr function."""
696 # Find newlines and replace them with p.break_()
--> 697 output = repr(obj)
698 for idx,output_line in enumerate(output.splitlines()):
699 if idx:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __repr__(self)
1319 min_rows=min_rows,
1320 max_rows=max_rows,
-> 1321 length=show_dimensions,
1322 )
1323 result = buf.getvalue()
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in to_string(self, buf, na_rep, float_format, header, index, length, dtype, name, max_rows, min_rows)
1384 max_rows=max_rows,
1385 )
-> 1386 result = formatter.to_string()
1387
1388 # catch contract violations
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\formats\format.py in to_string(self)
356
357 fmt_index, have_header = self._get_formatted_index()
--> 358 fmt_values = self._get_formatted_values()
359
360 if self.truncate_v:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\formats\format.py in _get_formatted_values(self)
345 None,
346 float_format=self.float_format,
--> 347 na_rep=self.na_rep,
348 )
349
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\formats\format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space, quoting)
1177 )
1178
-> 1179 return fmt_obj.get_result()
1180
1181
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\formats\format.py in get_result(self)
1208
1209 def get_result(self) -> List[str]:
-> 1210 fmt_values = self._format_strings()
1211 return _make_fixed_width(fmt_values, self.justify)
1212
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\formats\format.py in _format_strings(self)
1465
1466 if not isinstance(values, DatetimeIndex):
-> 1467 values = DatetimeIndex(values)
1468
1469 if self.formatter is not None and callable(self.formatter):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\datetimes.py in __new__(cls, data, freq, tz, normalize, closed, ambiguous, dayfirst, yearfirst, dtype, copy, name)
277 dayfirst=dayfirst,
278 yearfirst=yearfirst,
--> 279 ambiguous=ambiguous,
280 )
281
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arrays\datetimes.py in _from_sequence(cls, data, dtype, copy, tz, freq, dayfirst, yearfirst, ambiguous)
321 dayfirst=dayfirst,
322 yearfirst=yearfirst,
--> 323 ambiguous=ambiguous,
324 )
325
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arrays\datetimes.py in sequence_to_dt64ns(data, dtype, copy, tz, dayfirst, yearfirst, ambiguous)
1957 if is_datetime64tz_dtype(data_dtype):
1958 # DatetimeArray -> ndarray
-> 1959 tz = _maybe_infer_tz(tz, data.tz)
1960 result = data._data
1961
AttributeError: 'SparseArray' object has no attribute 'tz'
Expected:
0 2012-05-01 01:00:00
1 2016-05-01 01:00:00
dtype: Sparse[datetime64[ns], NaT]
Output of pd.show_versions()
INSTALLED VERSIONS
commit : d9fff27
python : 3.7.3.final.0
python-bits : 64
OS : Windows
OS-release : 10
Version : 10.0.18362
machine : AMD64
processor : Intel64 Family 6 Model 158 Stepping 10, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : None.None
pandas : 1.1.0
numpy : 1.19.1
pytz : 2019.3
dateutil : 2.8.0
pip : 19.3.1
setuptools : 41.4.0
Cython : 0.29.13
pytest : 5.4.1
hypothesis : None
sphinx : 2.2.0
blosc : None
feather : None
xlsxwriter : 1.2.2
lxml.etree : 4.4.1
html5lib : 1.0.1
pymysql : None
psycopg2 : None
jinja2 : 2.11.2
IPython : 7.8.0
pandas_datareader: None
bs4 : 4.8.1
bottleneck : 1.2.1
fsspec : 0.5.2
fastparquet : None
gcsfs : None
matplotlib : 3.2.1
numexpr : 2.7.0
odfpy : None
openpyxl : 3.0.0
pandas_gbq : None
pyarrow : 0.14.0
pytables : None
pyxlsb : None
s3fs : None
scipy : 1.3.1
sqlalchemy : 1.3.10
tables : 3.6.0
tabulate : 0.8.6
xarray : None
xlrd : 1.2.0
xlwt : 1.3.0
numba : 0.45.1