Skip to content

BUG: Fixed failure in StataReader when reading variable labels in 117 #7818

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 23, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ Bug Fixes


- Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)

- Bug in ``StataReader`` where variable labels were not read from format 117 files because the files Stata actually writes differ from the documented format (:issue:`7816`)



Expand Down
11 changes: 9 additions & 2 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,8 +520,15 @@ def _read_header(self):
self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9
seek_value_label_names = struct.unpack(
self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19
seek_variable_labels = struct.unpack(
self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17
# Stata 117 data files do not follow the documented format. This is
# a workaround that derives the offset from the preceding offset
# (seek_value_label_names): 33 bytes for each variable, 20 for the
# closing tag and 17 for the opening tag
self.path_or_buf.read(8) # <variable_labels>, throw away
seek_variable_labels = seek_value_label_names + (33*self.nvar) + 20 + 17
# Below is the original, correct code (per Stata sta format doc,
# although this is not followed in actual 117 dtas)
#seek_variable_labels = struct.unpack(
# self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17
self.path_or_buf.read(8) # <characteristics>
self.data_location = struct.unpack(
self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6
Expand Down
Binary file added pandas/io/tests/data/stata7_115.dta
Binary file not shown.
Binary file added pandas/io/tests/data/stata7_117.dta
Binary file not shown.
17 changes: 16 additions & 1 deletion pandas/io/tests/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ def setUp(self):
self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')
self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta')

self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta')
self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta')

def read_dta(self, file):
    """Read the Stata file ``file`` via ``read_stata`` with date conversion on.

    Returns whatever ``read_stata(file, convert_dates=True)`` returns.
    """
    parsed = read_stata(file, convert_dates=True)
    return parsed

Expand Down Expand Up @@ -199,7 +202,7 @@ def test_read_dta4(self):
'labeled_with_missings', 'float_labelled'])

# these are all categoricals
expected = pd.concat([ Series(pd.Categorical(value)) for col, value in expected.iteritems() ],axis=1)
expected = pd.concat([ Series(pd.Categorical(value)) for col, value in compat.iteritems(expected)],axis=1)

tm.assert_frame_equal(parsed_113, expected)
tm.assert_frame_equal(parsed_114, expected)
Expand Down Expand Up @@ -551,6 +554,18 @@ def test_bool_uint(self):
written_and_read_again = written_and_read_again.set_index('index')
tm.assert_frame_equal(written_and_read_again, expected)

def test_variable_labels(self):
    """Variable labels read from the 115 and 117 fixtures must agree.

    Every (name, label) pair reported for the 115 file has to be
    reported identically for the 117 file, and both the name and the
    label must be among the values this test expects.
    """
    expected_names = ('var1', 'var2', 'var3')
    expected_labels = ('label1', 'label2', 'label3')

    labels_115 = StataReader(self.dta16_115).variable_labels()
    labels_117 = StataReader(self.dta16_117).variable_labels()

    for name, label in compat.iteritems(labels_115):
        # The 117 reader must report the same label for the same variable.
        self.assertTrue(name in labels_117)
        self.assertTrue(label == labels_117[name])
        # Both must be among the names/labels expected by this test.
        self.assertTrue(name in expected_names)
        self.assertTrue(label in expected_labels)


if __name__ == '__main__':
    # Run this module's tests through nose when executed as a script;
    # exit=False is passed so nose does not call sys.exit() afterwards.
    nose.runmodule(
        argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
        exit=False,
    )
Expand Down