Skip to content

WIP/ENH: reading older Stata versions, #11526 #14159

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@ Other enhancements





.. _whatsnew_0200.api_breaking:

Backwards incompatible API changes
Expand Down Expand Up @@ -82,3 +80,5 @@ Performance Improvements
Bug Fixes
~~~~~~~~~

- ``read_stata`` can now handle some format 111 files, which are produced
  by SAS when generating Stata dta files (:issue:`11526`, :issue:`14159`)
6 changes: 3 additions & 3 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
from pandas.tslib import NaT, Timestamp

_version_error = ("Version of given Stata file is not 104, 105, 108, "
"113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), "
"117 (Stata 13), or 118 (Stata 14)")
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")

_statafile_processing_params1 = """\
convert_dates : boolean, defaults to True
Expand Down Expand Up @@ -1183,7 +1183,7 @@ def _get_seek_variable_labels(self):

def _read_old_header(self, first_char):
self.format_version = struct.unpack('b', first_char)[0]
if self.format_version not in [104, 105, 108, 113, 114, 115]:
if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
raise ValueError(_version_error)
self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
0] == 0x1 and '>' or '<'
Expand Down
Binary file added pandas/io/tests/data/stata7_111.dta
Binary file not shown.
16 changes: 16 additions & 0 deletions pandas/io/tests/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ def setUp(self):
self.dta22_118 = os.path.join(self.dirpath, 'stata14_118.dta')
self.dta23 = os.path.join(self.dirpath, 'stata15.dta')

self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')

def read_dta(self, file):
    """Read the Stata file ``file`` with the legacy reader defaults."""
    # Legacy default reader configuration: date conversion is switched on.
    legacy_kwargs = dict(convert_dates=True)
    return read_stata(file, **legacy_kwargs)
Expand Down Expand Up @@ -1219,6 +1221,20 @@ def test_repeated_column_labels(self):
read_stata(self.dta23, convert_categoricals=True)
tm.assertTrue('wolof' in cm.exception)

def test_stata_111(self):
    # Format 111 is an old version, but current versions of SAS still
    # produce it when exporting to Stata format. We do not know of any
    # online documentation for this version.
    df = read_stata(self.dta24_111)
    # Use the canonical ``np.nan`` spelling: the ``np.NaN`` alias was
    # removed in NumPy 2.0, while ``np.nan`` works on every version.
    original = pd.DataFrame({'y': [1, 1, 1, 1, 1, 0, 0, np.nan, 0, 0],
                             'x': [1, 2, 1, 3, np.nan, 4, 3, 5, 1, 6],
                             'w': [2, np.nan, 5, 2, 4, 4, 3, 1, 2, 3],
                             'z': ['a', 'b', 'c', 'd', 'e', '', 'g', 'h',
                                   'i', 'j']})
    # Pin the column order explicitly before comparing with the file.
    original = original[['y', 'x', 'w', 'z']]
    tm.assert_frame_equal(original, df)


if __name__ == '__main__':
    # Run this module's tests through nose when executed as a script;
    # exit=False keeps nose from terminating the interpreter afterwards.
    nose.runmodule(
        argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
        exit=False,
    )