Skip to content

Commit a367e9b

Browse files
committed
Merge pull request #7450 from bashtage/stata-13-strings
FIX: Enable fixed width strings to be read from Stata 13 (117) files
2 parents 38ca1e0 + 904933a commit a367e9b

File tree

5 files changed

+26
-4
lines changed

5 files changed

+26
-4
lines changed

doc/source/v0.14.1.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,8 @@ Bug Fixes
220220
- Bug where ``nanops._has_infs`` doesn't work with many dtypes
221221
(:issue:`7357`)
222222
- Bug in ``StataReader.data`` where reading a 0-observation dta failed (:issue:`7369`)
223+
- Bug when reading Stata 13 (117) files containing fixed-width strings (:issue:`7360`)
224+
- Bug when writing Stata files where the encoding was ignored (:issue:`7286`)
223225

224226

225227

pandas/io/stata.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -512,8 +512,10 @@ def _read_header(self):
512512
try:
513513
i = 0
514514
for typ in typlist:
515-
if typ <= 2045 or typ == 32768:
516-
self.typlist[i] = None
515+
if typ <= 2045:
516+
self.typlist[i] = typ
517+
elif typ == 32768:
518+
raise ValueError("Long strings are not supported")
517519
else:
518520
self.typlist[i] = self.TYPE_MAP_XML[typ]
519521
i += 1
@@ -1326,7 +1328,10 @@ def _write_data_nodates(self):
13261328
var = _pad_bytes('', typ)
13271329
if len(var) < typ:
13281330
var = _pad_bytes(var, typ)
1329-
self._write(var)
1331+
if compat.PY3:
1332+
self._write(var)
1333+
else:
1334+
self._write(var.encode(self._encoding))
13301335
else:
13311336
try:
13321337
self._file.write(struct.pack(byteorder + TYPE_MAP[typ],
@@ -1356,7 +1361,10 @@ def _write_data_dates(self):
13561361
if typ <= 244: # we've got a string
13571362
if len(var) < typ:
13581363
var = _pad_bytes(var, typ)
1359-
self._write(var)
1364+
if compat.PY3:
1365+
self._write(var)
1366+
else:
1367+
self._write(var.encode(self._encoding))
13601368
else:
13611369
self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var))
13621370

pandas/io/tests/data/stata5_117.dta

5.24 KB
Binary file not shown.

pandas/io/tests/data/stata6_117.dta

3.41 KB
Binary file not shown.

pandas/io/tests/test_stata.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,13 @@ def setUp(self):
6060
self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta')
6161
self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta')
6262
self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta')
63+
self.dta14_117 = os.path.join(self.dirpath, 'stata5_117.dta')
6364

6465
self.csv15 = os.path.join(self.dirpath, 'stata6.csv')
6566
self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta')
6667
self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta')
6768
self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')
69+
self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta')
6870

6971
def read_dta(self, file):
7072
return read_stata(file, convert_dates=True)
@@ -281,6 +283,11 @@ def test_encoding(self):
281283
self.assertEqual(result, expected)
282284
self.assertIsInstance(result, unicode)
283285

286+
with tm.ensure_clean() as path:
287+
encoded.to_stata(path,encoding='latin-1', write_index=False)
288+
reread_encoded = read_stata(path, encoding='latin-1')
289+
tm.assert_frame_equal(encoded, reread_encoded)
290+
284291
def test_read_write_dta11(self):
285292
original = DataFrame([(1, 2, 3, 4)],
286293
columns=['good', compat.u('b\u00E4d'), '8number', 'astringwithmorethan32characters______'])
@@ -354,9 +361,12 @@ def test_read_write_reread_dta14(self):
354361
parsed_114.index.name = 'index'
355362
parsed_115 = self.read_dta(self.dta14_115)
356363
parsed_115.index.name = 'index'
364+
parsed_117 = self.read_dta(self.dta14_117)
365+
parsed_117.index.name = 'index'
357366

358367
tm.assert_frame_equal(parsed_114, parsed_113)
359368
tm.assert_frame_equal(parsed_114, parsed_115)
369+
tm.assert_frame_equal(parsed_114, parsed_117)
360370

361371
with tm.ensure_clean() as path:
362372
parsed_114.to_stata(path, {'date_td': 'td'})
@@ -375,10 +385,12 @@ def test_read_write_reread_dta15(self):
375385
parsed_113 = self.read_dta(self.dta15_113)
376386
parsed_114 = self.read_dta(self.dta15_114)
377387
parsed_115 = self.read_dta(self.dta15_115)
388+
parsed_117 = self.read_dta(self.dta15_117)
378389

379390
tm.assert_frame_equal(expected, parsed_114)
380391
tm.assert_frame_equal(parsed_113, parsed_114)
381392
tm.assert_frame_equal(parsed_114, parsed_115)
393+
tm.assert_frame_equal(parsed_114, parsed_117)
382394

383395
def test_timestamp_and_label(self):
384396
original = DataFrame([(1,)], columns=['var'])

0 commit comments

Comments
 (0)