ENH: Enable automatic writing of dates to Stata files

bashtage · bashtage · commit 73f6565d0584 · 2016-07-20T12:15:16.000+01:00
Automatically select type %tc for datetime64[ns] columns
Change ValueErrors to NotImplementedError for unsupported types
Add tests for select exceptions
Improve to_stata and StataWriter docstrings
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -251,6 +251,7 @@ Other enhancements
 - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
 - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)
 - ``.to_stata()`` and ```StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`)
+- ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`)
 
 .. _whatsnew_0190.api:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1473,32 +1473,44 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
 
         Parameters
         ----------
-        fname : file path or buffer
-            Where to save the dta file.
+        fname : str or buffer
+            String path or file-like object
         convert_dates : dict
-            Dictionary mapping column of datetime types to the stata internal
-            format that you want to use for the dates. Options are
-            'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
-            number or a name.
+            Dictionary mapping columns containing datetime types to Stata
+            internal format to use when writing the dates. Options are 'tc',
+            'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an
+            integer or a name. Datetime columns that do not have a conversion
+            type specified will be converted to 'tc'. Datetime columns with
+            timezone information are not supported.
         write_index : bool
             Write the index to Stata dataset.
         encoding : str
-            Default is latin-1. Note that Stata does not support unicode.
+            Default is latin-1. Unicode is not supported
         byteorder : str
-            Can be ">", "<", "little", or "big". The default is None which uses
-            `sys.byteorder`
+            Can be ">", "<", "little", or "big". Default is `sys.byteorder`
         time_stamp : datetime
-            A date time to use when writing the file.  Can be None, in which
-            case the current time is used.
+            A datetime to use as file creation date.  Default is the current
+            time
         dataset_label : str
-            A label for the data set.  Should be 80 characters or smaller.
+            A label for the data set.  Must be 80 characters or smaller.
 
         .. versionadded:: 0.19.0
 
         variable_labels : dict
             Dictionary containing columns as keys and variable labels as
             values. Each label must be 80 characters or smaller.
 
+        Raises
+        ------
+        NotImplementedError
+            * If datetimes contain timezone information
+            * Column dtype is not representable in Stata
+        ValueError
+            * Columns listed in convert_dates contain values other than
+              datetime64[ns] or datetime.datetime
+            * Column listed in convert_dates is not in DataFrame
+            * Categorical label contains more than 32,000 characters
+
         Examples
         --------
         >>> writer = StataWriter('./data_file.dta', data)
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -432,7 +432,8 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
         d = parse_dates_safe(dates, year=True)
         conv_dates = d.year
     else:
-        raise ValueError("fmt %s not understood" % fmt)
+        raise NotImplementedError("Conversion from format %s "
+                                  "is not implemented" % fmt)
 
     conv_dates = Series(conv_dates, dtype=np.float64)
     missing_value = struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0]
@@ -1709,7 +1710,7 @@ def _convert_datetime_to_stata_type(fmt):
                "%tq", "th", "%th", "ty", "%ty"]:
         return np.float64  # Stata expects doubles for SIFs
     else:
-        raise ValueError("fmt %s not understood" % fmt)
+        raise NotImplementedError("Format %s not implemented" % fmt)
 
 
 def _maybe_convert_to_int_keys(convert_dates, varlist):
@@ -1721,9 +1722,8 @@ def _maybe_convert_to_int_keys(convert_dates, varlist):
             new_dict.update({varlist.index(key): convert_dates[key]})
         else:
             if not isinstance(key, int):
-                raise ValueError(
-                    "convert_dates key is not in varlist and is not an int"
-                )
+                raise ValueError("convert_dates key must be a "
+                                 "column or an integer")
             new_dict.update({key: convert_dates[key]})
     return new_dict
 
@@ -1763,8 +1763,7 @@ def _dtype_to_stata_type(dtype, column):
     elif dtype == np.int8:
         return chr(251)
     else:  # pragma : no cover
-        raise ValueError("Data type %s not currently understood. "
-                         "Please report an error to the developers." % dtype)
+        raise NotImplementedError("Data type %s not supported." % dtype)
 
 
 def _dtype_to_default_stata_fmt(dtype, column):
@@ -1801,35 +1800,36 @@ def _dtype_to_default_stata_fmt(dtype, column):
     elif dtype == np.int8 or dtype == np.int16:
         return "%8.0g"
     else:  # pragma : no cover
-        raise ValueError("Data type %s not currently understood. "
-                         "Please report an error to the developers." % dtype)
+        raise NotImplementedError("Data type %s not supported." % dtype)
 
 
 class StataWriter(StataParser):
     """
-    A class for writing Stata binary dta files from array-like objects
+    A class for writing Stata binary dta files
 
     Parameters
     ----------
-    fname : file path or buffer
-        Where to save the dta file.
-    data : array-like
-        Array-like input to save. Pandas objects are also accepted.
+    fname : str or buffer
+        String path or file-like object
+    data : DataFrame
+        Input to save
     convert_dates : dict
-        Dictionary mapping column of datetime types to the stata internal
-        format that you want to use for the dates. Options are
-        'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
-        number or a name.
+        Dictionary mapping columns containing datetime types to Stata internal
+        format to use when writing the dates. Options are 'tc', 'td', 'tm',
+        'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
+        Datetime columns that do not have a conversion type specified will be
+        converted to 'tc'. Datetime columns with timezone information are not
+        supported.
+    write_index : bool
+        Write the index to Stata dataset.
     encoding : str
-        Default is latin-1. Note that Stata does not support unicode.
+        Default is latin-1. Unicode is not supported
     byteorder : str
-        Can be ">", "<", "little", or "big". The default is None which uses
-        `sys.byteorder`
+        Can be ">", "<", "little", or "big". Default is `sys.byteorder`
     time_stamp : datetime
-        A date time to use when writing the file.  Can be None, in which
-        case the current time is used.
+        A datetime to use as file creation date.  Default is the current time
     dataset_label : str
-        A label for the data set.  Should be 80 characters or smaller.
+        A label for the data set.  Must be 80 characters or smaller.
 
     .. versionadded:: 0.19.0
 
@@ -1843,6 +1843,17 @@ class StataWriter(StataParser):
         The StataWriter instance has a write_file method, which will
         write the file to the given `fname`.
 
+    Raises
+    ------
+    NotImplementedError
+        * If datetimes contain timezone information
+        * Column dtype is not representable in Stata
+    ValueError
+        * Columns listed in convert_dates contain values other than
+          datetime64[ns] or datetime.datetime
+        * Column listed in convert_dates is not in DataFrame
+        * Categorical label contains more than 32,000 characters
+
     Examples
     --------
     >>> import pandas as pd
@@ -1861,7 +1872,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
                  encoding="latin-1", byteorder=None, time_stamp=None,
                  data_label=None, variable_labels=None):
         super(StataWriter, self).__init__(encoding)
-        self._convert_dates = convert_dates
+        self._convert_dates = {} if convert_dates is None else convert_dates
         self._write_index = write_index
         self._time_stamp = time_stamp
         self._data_label = data_label
@@ -2041,15 +2052,22 @@ def _prepare_pandas(self, data):
         self.varlist = data.columns.tolist()
 
         dtypes = data.dtypes
-        if self._convert_dates is not None:
-            self._convert_dates = _maybe_convert_to_int_keys(
-                self._convert_dates, self.varlist
+
+        # Ensure all date columns are converted
+        for col in data:
+            if col in self._convert_dates:
+                continue
+            if is_datetime64_dtype(data[col]):
+                self._convert_dates[col] = 'tc'
+
+        self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates,
+                                                         self.varlist)
+        for key in self._convert_dates:
+            new_type = _convert_datetime_to_stata_type(
+                self._convert_dates[key]
             )
-            for key in self._convert_dates:
-                new_type = _convert_datetime_to_stata_type(
-                    self._convert_dates[key]
-                )
-                dtypes[key] = np.dtype(new_type)
+            dtypes[key] = np.dtype(new_type)
+
         self.typlist = []
         self.fmtlist = []
         for col, dtype in dtypes.iteritems():
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -11,17 +11,16 @@
 
 import nose
 import numpy as np
-
 import pandas as pd
 import pandas.util.testing as tm
 from pandas import compat
 from pandas.compat import iterkeys
 from pandas.core.frame import DataFrame, Series
-from pandas.types.common import is_categorical_dtype
-from pandas.tslib import NaT
 from pandas.io.parsers import read_csv
 from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
                              PossiblePrecisionLoss, StataMissingValue)
+from pandas.tslib import NaT
+from pandas.types.common import is_categorical_dtype
 
 
 class TestStata(tm.TestCase):
@@ -1165,6 +1164,52 @@ def test_write_variable_label_errors(self):
             with tm.ensure_clean() as path:
                 original.to_stata(path, variable_labels=variable_labels_long)
 
+    def test_default_date_conversion(self):
+        # GH 12259
+        dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
+                 dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
+                 dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
+        original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
+                                 'strs': ['apple', 'banana', 'cherry'],
+                                 'dates': dates})
+
+        with tm.ensure_clean() as path:
+            original.to_stata(path, write_index=False)
+            reread = read_stata(path, convert_dates=True)
+            tm.assert_frame_equal(original, reread)
+
+            original.to_stata(path,
+                              write_index=False,
+                              convert_dates={'dates': 'tc'})
+            direct = read_stata(path, convert_dates=True)
+            tm.assert_frame_equal(reread, direct)
+
+    def test_unsupported_type(self):
+        original = pd.DataFrame({'a': [1 + 2j, 2 + 4j]})
+
+        with tm.assertRaises(NotImplementedError):
+            with tm.ensure_clean() as path:
+                original.to_stata(path)
+
+    def test_unsupported_datetype(self):
+        dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
+                 dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
+                 dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
+        original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
+                                 'strs': ['apple', 'banana', 'cherry'],
+                                 'dates': dates})
+
+        with tm.assertRaises(NotImplementedError):
+            with tm.ensure_clean() as path:
+                original.to_stata(path, convert_dates={'dates': 'tC'})
+
+        dates = pd.date_range('1-1-1990', periods=3, tz='Asia/Hong_Kong')
+        original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
+                                 'strs': ['apple', 'banana', 'cherry'],
+                                 'dates': dates})
+        with tm.assertRaises(NotImplementedError):
+            with tm.ensure_clean() as path:
+                original.to_stata(path)
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],