Skip to content

Commit 0638be8

Browse files
committed
ENH: Allow timestamp and data label to be set when exporting to Stata
Added code which allows the time stamp and the data label to be set using either StataWriter or to_stata. Also simplified reading these values using StataReader by removing null bytes from the string values read. Added basic test for both. Also fixed one small bug where variables could be stored using Stata reserved words.
1 parent 170377d commit 0638be8

File tree

5 files changed

+67
-17
lines changed

5 files changed

+67
-17
lines changed

doc/source/release.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ Improvements to existing features
147147
- perf improvements in DataFrame construction with certain offsets, by removing faulty caching
148148
(e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`)
149149
- perf improvements in single-dtyped indexing (:issue:`6484`)
150+
- ``StataWriter`` and ``DataFrame.to_stata`` accept time stamp and data labels (:issue:`6545`)
150151

151152
.. _release.bug_fixes-0.14.0:
152153

doc/source/v0.14.0.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,9 @@ Enhancements
312312
- ``DataFrame.to_stata`` will now check data for compatibility with Stata data types
313313
and will upcast when needed. When it isn't possible to losslessly upcast, a warning
314314
is raised (:issue:`6327`)
315+
- ``DataFrame.to_stata`` and ``StataWriter`` will accept keyword arguments time_stamp
316+
and data_label which allow the time stamp and dataset label to be set when creating a
317+
file. (:issue:`6545`)
315318

316319
Performance
317320
~~~~~~~~~~~

pandas/core/frame.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1216,7 +1216,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
12161216

12171217
def to_stata(
12181218
self, fname, convert_dates=None, write_index=True, encoding="latin-1",
1219-
byteorder=None):
1219+
byteorder=None, time_stamp=None, data_label=None):
12201220
"""
12211221
Write the DataFrame to a Stata binary dta file
12221222
@@ -1247,7 +1247,8 @@ def to_stata(
12471247
"""
12481248
from pandas.io.stata import StataWriter
12491249
writer = StataWriter(fname, self, convert_dates=convert_dates,
1250-
encoding=encoding, byteorder=byteorder)
1250+
encoding=encoding, byteorder=byteorder,
1251+
time_stamp=time_stamp, data_label=data_label)
12511252
writer.write_file()
12521253

12531254
def to_sql(self, name, con, flavor='sqlite', if_exists='fail', **kwargs):

pandas/io/stata.py

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,18 @@ def __init__(self, encoding):
375375
'd': np.float64(struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0])
376376
}
377377

378+
# Reserved words cannot be used as variable names
379+
self.RESERVED_WORDS = ('aggregate', 'array', 'boolean', 'break',
380+
'byte', 'case', 'catch', 'class', 'colvector',
381+
'complex', 'const', 'continue', 'default',
382+
'delegate', 'delete', 'do', 'double', 'else',
383+
'eltypedef', 'end', 'enum', 'explicit',
384+
'export', 'external', 'float', 'for', 'friend',
385+
'function', 'global', 'goto', 'if', 'inline',
386+
'int', 'local', 'long', 'NULL', 'pragma',
387+
'protected', 'quad', 'rowvector', 'short',
388+
'typedef', 'typename', 'virtual')
389+
378390
def _decode_bytes(self, str, errors=None):
379391
if compat.PY3 or self._encoding is not None:
380392
return str.decode(self._encoding, errors)
@@ -449,10 +461,10 @@ def _read_header(self):
449461
self.path_or_buf.read(4))[0]
450462
self.path_or_buf.read(11) # </N><label>
451463
strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
452-
self.data_label = self.path_or_buf.read(strlen)
464+
self.data_label = self._null_terminate(self.path_or_buf.read(strlen))
453465
self.path_or_buf.read(19) # </label><timestamp>
454466
strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
455-
self.time_stamp = self.path_or_buf.read(strlen)
467+
self.time_stamp = self._null_terminate(self.path_or_buf.read(strlen))
456468
self.path_or_buf.read(26) # </timestamp></header><map>
457469
self.path_or_buf.read(8) # 0x0000000000000000
458470
self.path_or_buf.read(8) # position of <map>
@@ -543,11 +555,11 @@ def _read_header(self):
543555
self.nobs = struct.unpack(self.byteorder + 'I',
544556
self.path_or_buf.read(4))[0]
545557
if self.format_version > 105:
546-
self.data_label = self.path_or_buf.read(81)
558+
self.data_label = self._null_terminate(self.path_or_buf.read(81))
547559
else:
548-
self.data_label = self.path_or_buf.read(32)
560+
self.data_label = self._null_terminate(self.path_or_buf.read(32))
549561
if self.format_version > 104:
550-
self.time_stamp = self.path_or_buf.read(18)
562+
self.time_stamp = self._null_terminate(self.path_or_buf.read(18))
551563

552564
# descriptors
553565
if self.format_version > 108:
@@ -1029,6 +1041,11 @@ class StataWriter(StataParser):
10291041
byteorder : str
10301042
Can be ">", "<", "little", or "big". The default is None which uses
10311043
`sys.byteorder`
1044+
time_stamp : datetime
1045+
A date time to use when writing the file. Can be None, in which
1046+
case the current time is used.
1047+
data_label : str
1048+
A label for the data set. Should be 80 characters or fewer.
10321049
10331050
Returns
10341051
-------
@@ -1047,10 +1064,13 @@ class StataWriter(StataParser):
10471064
>>> writer.write_file()
10481065
"""
10491066
def __init__(self, fname, data, convert_dates=None, write_index=True,
1050-
encoding="latin-1", byteorder=None):
1067+
encoding="latin-1", byteorder=None, time_stamp=None,
1068+
data_label=None):
10511069
super(StataWriter, self).__init__(encoding)
10521070
self._convert_dates = convert_dates
10531071
self._write_index = write_index
1072+
self._time_stamp = time_stamp
1073+
self._data_label = data_label
10541074
# attach nobs, nvars, data, varlist, typlist
10551075
self._prepare_pandas(data)
10561076

@@ -1086,7 +1106,7 @@ def __iter__(self):
10861106

10871107
if self._write_index:
10881108
data = data.reset_index()
1089-
# Check columns for compatbaility with stata
1109+
# Check columns for compatibility with stata
10901110
data = _cast_to_stata_types(data)
10911111
self.datarows = DataFrameRowIter(data)
10921112
self.nobs, self.nvar = data.shape
@@ -1110,7 +1130,8 @@ def __iter__(self):
11101130
self.fmtlist[key] = self._convert_dates[key]
11111131

11121132
def write_file(self):
1113-
self._write_header()
1133+
self._write_header(time_stamp=self._time_stamp,
1134+
data_label=self._data_label)
11141135
self._write_descriptors()
11151136
self._write_variable_labels()
11161137
# write 5 zeros for expansion fields
@@ -1147,7 +1168,7 @@ def _write_header(self, data_label=None, time_stamp=None):
11471168
# format dd Mon yyyy hh:mm
11481169
if time_stamp is None:
11491170
time_stamp = datetime.datetime.now()
1150-
elif not isinstance(time_stamp, datetime):
1171+
elif not isinstance(time_stamp, datetime.datetime):
11511172
raise ValueError("time_stamp should be datetime type")
11521173
self._file.write(
11531174
self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M"))
@@ -1169,7 +1190,9 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
11691190
for c in name:
11701191
if (c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_':
11711192
name = name.replace(c, '_')
1172-
1193+
# Variable name must not be a reserved word
1194+
if name in self.RESERVED_WORDS:
1195+
name = '_' + name
11731196
# Variable name may not start with a number
11741197
if name[0] > '0' and name[0] < '9':
11751198
name = '_' + name

pandas/io/tests/test_stata.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# pylint: disable=E1101
22

33
from datetime import datetime
4+
import datetime as dt
45
import os
56
import warnings
67
import nose
@@ -248,7 +249,7 @@ def test_read_write_dta10(self):
248249

249250
original = DataFrame(data=[["string", "object", 1, 1.1,
250251
np.datetime64('2003-12-25')]],
251-
columns=['string', 'object', 'integer', 'float',
252+
columns=['string', 'object', 'integer', 'floating',
252253
'datetime'])
253254
original["object"] = Series(original["object"], dtype=object)
254255
original.index.name = 'index'
@@ -304,10 +305,20 @@ def test_read_write_dta11(self):
304305
def test_read_write_dta12(self):
305306
# skip_if_not_little_endian()
306307

307-
original = DataFrame([(1, 2, 3, 4)],
308-
columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-'])
309-
formatted = DataFrame([(1, 2, 3, 4)],
310-
columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_'])
308+
original = DataFrame([(1, 2, 3, 4, 5, 6)],
309+
columns=['astringwithmorethan32characters_1',
310+
'astringwithmorethan32characters_2',
311+
'+',
312+
'-',
313+
'short',
314+
'delete'])
315+
formatted = DataFrame([(1, 2, 3, 4, 5, 6)],
316+
columns=['astringwithmorethan32characters_',
317+
'_0astringwithmorethan32character',
318+
'_',
319+
'_1_',
320+
'_short',
321+
'_delete'])
311322
formatted.index.name = 'index'
312323
formatted = formatted.astype(np.int32)
313324

@@ -376,6 +387,17 @@ def test_read_write_reread_dta15(self):
376387
tm.assert_frame_equal(parsed_113, parsed_114)
377388
tm.assert_frame_equal(parsed_114, parsed_115)
378389

390+
def test_timestamp_and_label(self):
391+
original = DataFrame([(1,)], columns=['var'])
392+
time_stamp = datetime(2000, 2, 29, 14, 21)
393+
data_label = 'This is a data file.'
394+
with tm.ensure_clean() as path:
395+
original.to_stata(path, time_stamp=time_stamp, data_label=data_label)
396+
reader = StataReader(path)
397+
parsed_time_stamp = dt.datetime.strptime(reader.time_stamp, ('%d %b %Y %H:%M'))
398+
assert parsed_time_stamp == time_stamp
399+
assert reader.data_label == data_label
400+
379401

380402

381403
if __name__ == '__main__':

0 commit comments

Comments
 (0)