Skip to content

FIX: Enable fixed width strings to be read from Stata 13 (117) files #7450

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 16, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/v0.14.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,8 @@ Bug Fixes
- Bug where ``nanops._has_infs`` doesn't work with many dtypes
(:issue:`7357`)
- Bug in ``StataReader.data`` where reading a 0-observation dta failed (:issue:`7369`)
- Bug when reading Stata 13 (117) files containing fixed-width strings (:issue:`7360`)
- Bug when writing Stata files where the encoding was ignored (:issue:`7286`)



Expand Down
16 changes: 12 additions & 4 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,8 +512,10 @@ def _read_header(self):
try:
i = 0
for typ in typlist:
if typ <= 2045 or typ == 32768:
self.typlist[i] = None
if typ <= 2045:
self.typlist[i] = typ
elif typ == 32768:
raise ValueError("Long strings are not supported")
else:
self.typlist[i] = self.TYPE_MAP_XML[typ]
i += 1
Expand Down Expand Up @@ -1326,7 +1328,10 @@ def _write_data_nodates(self):
var = _pad_bytes('', typ)
if len(var) < typ:
var = _pad_bytes(var, typ)
self._write(var)
if compat.PY3:
self._write(var)
else:
self._write(var.encode(self._encoding))
else:
try:
self._file.write(struct.pack(byteorder + TYPE_MAP[typ],
Expand Down Expand Up @@ -1356,7 +1361,10 @@ def _write_data_dates(self):
if typ <= 244: # we've got a string
if len(var) < typ:
var = _pad_bytes(var, typ)
self._write(var)
if compat.PY3:
self._write(var)
else:
self._write(var.encode(self._encoding))
else:
self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var))

Expand Down
Binary file added pandas/io/tests/data/stata5_117.dta
Binary file not shown.
Binary file added pandas/io/tests/data/stata6_117.dta
Binary file not shown.
12 changes: 12 additions & 0 deletions pandas/io/tests/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,13 @@ def setUp(self):
self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta')
self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta')
self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta')
self.dta14_117 = os.path.join(self.dirpath, 'stata5_117.dta')

self.csv15 = os.path.join(self.dirpath, 'stata6.csv')
self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta')
self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta')
self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')
self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta')

def read_dta(self, file):
return read_stata(file, convert_dates=True)
Expand Down Expand Up @@ -281,6 +283,11 @@ def test_encoding(self):
self.assertEqual(result, expected)
self.assertIsInstance(result, unicode)

with tm.ensure_clean() as path:
encoded.to_stata(path,encoding='latin-1', write_index=False)
reread_encoded = read_stata(path, encoding='latin-1')
tm.assert_frame_equal(encoded, reread_encoded)

def test_read_write_dta11(self):
original = DataFrame([(1, 2, 3, 4)],
columns=['good', compat.u('b\u00E4d'), '8number', 'astringwithmorethan32characters______'])
Expand Down Expand Up @@ -354,9 +361,12 @@ def test_read_write_reread_dta14(self):
parsed_114.index.name = 'index'
parsed_115 = self.read_dta(self.dta14_115)
parsed_115.index.name = 'index'
parsed_117 = self.read_dta(self.dta14_117)
parsed_117.index.name = 'index'

tm.assert_frame_equal(parsed_114, parsed_113)
tm.assert_frame_equal(parsed_114, parsed_115)
tm.assert_frame_equal(parsed_114, parsed_117)

with tm.ensure_clean() as path:
parsed_114.to_stata(path, {'date_td': 'td'})
Expand All @@ -375,10 +385,12 @@ def test_read_write_reread_dta15(self):
parsed_113 = self.read_dta(self.dta15_113)
parsed_114 = self.read_dta(self.dta15_114)
parsed_115 = self.read_dta(self.dta15_115)
parsed_117 = self.read_dta(self.dta15_117)

tm.assert_frame_equal(expected, parsed_114)
tm.assert_frame_equal(parsed_113, parsed_114)
tm.assert_frame_equal(parsed_114, parsed_115)
tm.assert_frame_equal(parsed_114, parsed_117)

def test_timestamp_and_label(self):
original = DataFrame([(1,)], columns=['var'])
Expand Down