Skip to content

Commit 73f6565

Browse files
committed
ENH: Enable automatic writing of dates to Stata files
Automatically select type %tc for datetime64[ns] columns. Change ValueErrors to NotImplementedError for unsupported types. Add tests for select exceptions. Improve to_stata and StataWriter docstrings.
1 parent 8acfad3 commit 73f6565

File tree

4 files changed

+124
-48
lines changed

4 files changed

+124
-48
lines changed

doc/source/whatsnew/v0.19.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ Other enhancements
251251
- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
252252
- ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)
253253
- ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to map column names to labels (:issue:`13535`, :issue:`13536`)
254+
- ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`)
254255

255256
.. _whatsnew_0190.api:
256257

pandas/core/frame.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1473,32 +1473,44 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
14731473
14741474
Parameters
14751475
----------
1476-
fname : file path or buffer
1477-
Where to save the dta file.
1476+
fname : str or buffer
1477+
String path or file-like object
14781478
convert_dates : dict
1479-
Dictionary mapping column of datetime types to the stata internal
1480-
format that you want to use for the dates. Options are
1481-
'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
1482-
number or a name.
1479+
Dictionary mapping columns containing datetime types to Stata
1480+
internal format to use when writing the dates. Options are 'tc',
1481+
'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an
1482+
integer or a name. Datetime columns that do not have a conversion
1483+
type specified will be converted to 'tc'. Datetime columns with
1484+
timezone information are not supported.
14831485
write_index : bool
14841486
Write the index to Stata dataset.
14851487
encoding : str
1486-
Default is latin-1. Note that Stata does not support unicode.
1488+
Default is latin-1. Unicode is not supported
14871489
byteorder : str
1488-
Can be ">", "<", "little", or "big". The default is None which uses
1489-
`sys.byteorder`
1490+
Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
14901491
time_stamp : datetime
1491-
A date time to use when writing the file. Can be None, in which
1492-
case the current time is used.
1492+
A datetime to use as file creation date. Default is the current
1493+
time
14931494
dataset_label : str
1494-
A label for the data set. Should be 80 characters or smaller.
1495+
A label for the data set. Must be 80 characters or smaller.
14951496
14961497
.. versionadded:: 0.19.0
14971498
14981499
variable_labels : dict
14991500
Dictionary containing columns as keys and variable labels as
15001501
values. Each label must be 80 characters or smaller.
15011502
1503+
Raises
1504+
------
1505+
NotImplementedError
1506+
* If datetimes contain timezone information
1507+
* Column dtype is not representable in Stata
1508+
ValueError
1509+
* Columns listed in convert_dates contain values other than
1510+
datetime64[ns] or datetime.datetime
1511+
* Column listed in convert_dates is not in DataFrame
1512+
* Categorical label contains more than 32,000 characters
1513+
15021514
Examples
15031515
--------
15041516
>>> writer = StataWriter('./data_file.dta', data)

pandas/io/stata.py

Lines changed: 51 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,8 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
432432
d = parse_dates_safe(dates, year=True)
433433
conv_dates = d.year
434434
else:
435-
raise ValueError("fmt %s not understood" % fmt)
435+
raise NotImplementedError("Conversion from format %s "
436+
"is not implemented" % fmt)
436437

437438
conv_dates = Series(conv_dates, dtype=np.float64)
438439
missing_value = struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0]
@@ -1709,7 +1710,7 @@ def _convert_datetime_to_stata_type(fmt):
17091710
"%tq", "th", "%th", "ty", "%ty"]:
17101711
return np.float64 # Stata expects doubles for SIFs
17111712
else:
1712-
raise ValueError("fmt %s not understood" % fmt)
1713+
raise NotImplementedError("Format %s not implemented" % fmt)
17131714

17141715

17151716
def _maybe_convert_to_int_keys(convert_dates, varlist):
@@ -1721,9 +1722,8 @@ def _maybe_convert_to_int_keys(convert_dates, varlist):
17211722
new_dict.update({varlist.index(key): convert_dates[key]})
17221723
else:
17231724
if not isinstance(key, int):
1724-
raise ValueError(
1725-
"convert_dates key is not in varlist and is not an int"
1726-
)
1725+
raise ValueError("convert_dates key must be a "
1726+
"column or an integer")
17271727
new_dict.update({key: convert_dates[key]})
17281728
return new_dict
17291729

@@ -1763,8 +1763,7 @@ def _dtype_to_stata_type(dtype, column):
17631763
elif dtype == np.int8:
17641764
return chr(251)
17651765
else: # pragma : no cover
1766-
raise ValueError("Data type %s not currently understood. "
1767-
"Please report an error to the developers." % dtype)
1766+
raise NotImplementedError("Data type %s not supported." % dtype)
17681767

17691768

17701769
def _dtype_to_default_stata_fmt(dtype, column):
@@ -1801,35 +1800,36 @@ def _dtype_to_default_stata_fmt(dtype, column):
18011800
elif dtype == np.int8 or dtype == np.int16:
18021801
return "%8.0g"
18031802
else: # pragma : no cover
1804-
raise ValueError("Data type %s not currently understood. "
1805-
"Please report an error to the developers." % dtype)
1803+
raise NotImplementedError("Data type %s not supported." % dtype)
18061804

18071805

18081806
class StataWriter(StataParser):
18091807
"""
1810-
A class for writing Stata binary dta files from array-like objects
1808+
A class for writing Stata binary dta files
18111809
18121810
Parameters
18131811
----------
1814-
fname : file path or buffer
1815-
Where to save the dta file.
1816-
data : array-like
1817-
Array-like input to save. Pandas objects are also accepted.
1812+
fname : str or buffer
1813+
String path or file-like object
1814+
data : DataFrame
1815+
Input to save
18181816
convert_dates : dict
1819-
Dictionary mapping column of datetime types to the stata internal
1820-
format that you want to use for the dates. Options are
1821-
'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
1822-
number or a name.
1817+
Dictionary mapping columns containing datetime types to Stata internal
1818+
format to use when writing the dates. Options are 'tc', 'td', 'tm',
1819+
'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
1820+
Datetime columns that do not have a conversion type specified will be
1821+
converted to 'tc'. Datetime columns with timezone information are not
1822+
supported.
1823+
write_index : bool
1824+
Write the index to Stata dataset.
18231825
encoding : str
1824-
Default is latin-1. Note that Stata does not support unicode.
1826+
Default is latin-1. Unicode is not supported
18251827
byteorder : str
1826-
Can be ">", "<", "little", or "big". The default is None which uses
1827-
`sys.byteorder`
1828+
Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
18281829
time_stamp : datetime
1829-
A date time to use when writing the file. Can be None, in which
1830-
case the current time is used.
1830+
A datetime to use as file creation date. Default is the current time
18311831
dataset_label : str
1832-
A label for the data set. Should be 80 characters or smaller.
1832+
A label for the data set. Must be 80 characters or smaller.
18331833
18341834
.. versionadded:: 0.19.0
18351835
@@ -1843,6 +1843,17 @@ class StataWriter(StataParser):
18431843
The StataWriter instance has a write_file method, which will
18441844
write the file to the given `fname`.
18451845
1846+
Raises
1847+
------
1848+
NotImplementedError
1849+
* If datetimes contain timezone information
1850+
* Column dtype is not representable in Stata
1851+
ValueError
1852+
* Columns listed in convert_dates contain values other than
1853+
datetime64[ns] or datetime.datetime
1854+
* Column listed in convert_dates is not in DataFrame
1855+
* Categorical label contains more than 32,000 characters
1856+
18461857
Examples
18471858
--------
18481859
>>> import pandas as pd
@@ -1861,7 +1872,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
18611872
encoding="latin-1", byteorder=None, time_stamp=None,
18621873
data_label=None, variable_labels=None):
18631874
super(StataWriter, self).__init__(encoding)
1864-
self._convert_dates = convert_dates
1875+
self._convert_dates = {} if convert_dates is None else convert_dates
18651876
self._write_index = write_index
18661877
self._time_stamp = time_stamp
18671878
self._data_label = data_label
@@ -2041,15 +2052,22 @@ def _prepare_pandas(self, data):
20412052
self.varlist = data.columns.tolist()
20422053

20432054
dtypes = data.dtypes
2044-
if self._convert_dates is not None:
2045-
self._convert_dates = _maybe_convert_to_int_keys(
2046-
self._convert_dates, self.varlist
2055+
2056+
# Ensure all date columns are converted
2057+
for col in data:
2058+
if col in self._convert_dates:
2059+
continue
2060+
if is_datetime64_dtype(data[col]):
2061+
self._convert_dates[col] = 'tc'
2062+
2063+
self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates,
2064+
self.varlist)
2065+
for key in self._convert_dates:
2066+
new_type = _convert_datetime_to_stata_type(
2067+
self._convert_dates[key]
20472068
)
2048-
for key in self._convert_dates:
2049-
new_type = _convert_datetime_to_stata_type(
2050-
self._convert_dates[key]
2051-
)
2052-
dtypes[key] = np.dtype(new_type)
2069+
dtypes[key] = np.dtype(new_type)
2070+
20532071
self.typlist = []
20542072
self.fmtlist = []
20552073
for col, dtype in dtypes.iteritems():

pandas/io/tests/test_stata.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,16 @@
1111

1212
import nose
1313
import numpy as np
14-
1514
import pandas as pd
1615
import pandas.util.testing as tm
1716
from pandas import compat
1817
from pandas.compat import iterkeys
1918
from pandas.core.frame import DataFrame, Series
20-
from pandas.types.common import is_categorical_dtype
21-
from pandas.tslib import NaT
2219
from pandas.io.parsers import read_csv
2320
from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
2421
PossiblePrecisionLoss, StataMissingValue)
22+
from pandas.tslib import NaT
23+
from pandas.types.common import is_categorical_dtype
2524

2625

2726
class TestStata(tm.TestCase):
@@ -1165,6 +1164,52 @@ def test_write_variable_label_errors(self):
11651164
with tm.ensure_clean() as path:
11661165
original.to_stata(path, variable_labels=variable_labels_long)
11671166

1167+
def test_default_date_conversion(self):
1168+
# GH 12259
1169+
dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
1170+
dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
1171+
dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
1172+
original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
1173+
'strs': ['apple', 'banana', 'cherry'],
1174+
'dates': dates})
1175+
1176+
with tm.ensure_clean() as path:
1177+
original.to_stata(path, write_index=False)
1178+
reread = read_stata(path, convert_dates=True)
1179+
tm.assert_frame_equal(original, reread)
1180+
1181+
original.to_stata(path,
1182+
write_index=False,
1183+
convert_dates={'dates': 'tc'})
1184+
direct = read_stata(path, convert_dates=True)
1185+
tm.assert_frame_equal(reread, direct)
1186+
1187+
def test_unsupported_type(self):
1188+
original = pd.DataFrame({'a': [1 + 2j, 2 + 4j]})
1189+
1190+
with tm.assertRaises(NotImplementedError):
1191+
with tm.ensure_clean() as path:
1192+
original.to_stata(path)
1193+
1194+
def test_unsupported_datetype(self):
1195+
dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
1196+
dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
1197+
dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
1198+
original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
1199+
'strs': ['apple', 'banana', 'cherry'],
1200+
'dates': dates})
1201+
1202+
with tm.assertRaises(NotImplementedError):
1203+
with tm.ensure_clean() as path:
1204+
original.to_stata(path, convert_dates={'dates': 'tC'})
1205+
1206+
dates = pd.date_range('1-1-1990', periods=3, tz='Asia/Hong_Kong')
1207+
original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
1208+
'strs': ['apple', 'banana', 'cherry'],
1209+
'dates': dates})
1210+
with tm.assertRaises(NotImplementedError):
1211+
with tm.ensure_clean() as path:
1212+
original.to_stata(path)
11681213

11691214
if __name__ == '__main__':
11701215
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)