diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 06c93541a7783..2322af4752e2e 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -212,7 +212,7 @@ Bug Fixes - Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`) - +- Bug in ``StataReader`` which did not read variable labels in 117 files due to a difference between Stata documentation and implementation (:issue:`7816`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 48a5f5ee6c994..3458a95ac096d 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -520,8 +520,15 @@ def _read_header(self): self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9 seek_value_label_names = struct.unpack( self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19 - seek_variable_labels = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17 + # Stata 117 data files do not follow the described format. This is + # a workaround that uses the previous offset, 33 bytes for each + # variable, 20 for the closing tag and 17 for the opening tag + self.path_or_buf.read(8) # variable labels offset, throw away + seek_variable_labels = seek_value_label_names + (33*self.nvar) + 20 + 17 + # Below is the original, correct code (per Stata dta format doc, + # although this is not followed in actual 117 dtas) + #seek_variable_labels = struct.unpack( + # self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17 self.path_or_buf.read(8) # self.data_location = struct.unpack( self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6 diff --git a/pandas/io/tests/data/stata7_115.dta b/pandas/io/tests/data/stata7_115.dta new file mode 100644 index 0000000000000..133713b201ba8 Binary files /dev/null and b/pandas/io/tests/data/stata7_115.dta differ diff --git a/pandas/io/tests/data/stata7_117.dta b/pandas/io/tests/data/stata7_117.dta new file mode 100644 index 0000000000000..c001478fc902d Binary files /dev/null and b/pandas/io/tests/data/stata7_117.dta differ diff --git 
a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 435226bc4313f..5271604235922 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -68,6 +68,9 @@ def setUp(self): self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta') self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta') + self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta') + self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta') + def read_dta(self, file): return read_stata(file, convert_dates=True) @@ -199,7 +202,7 @@ def test_read_dta4(self): 'labeled_with_missings', 'float_labelled']) # these are all categoricals - expected = pd.concat([ Series(pd.Categorical(value)) for col, value in expected.iteritems() ],axis=1) + expected = pd.concat([ Series(pd.Categorical(value)) for col, value in compat.iteritems(expected)],axis=1) tm.assert_frame_equal(parsed_113, expected) tm.assert_frame_equal(parsed_114, expected) @@ -551,6 +554,18 @@ def test_bool_uint(self): written_and_read_again = written_and_read_again.set_index('index') tm.assert_frame_equal(written_and_read_again, expected) + def test_variable_labels(self): + sr_115 = StataReader(self.dta16_115).variable_labels() + sr_117 = StataReader(self.dta16_117).variable_labels() + keys = ('var1', 'var2', 'var3') + labels = ('label1', 'label2', 'label3') + for k,v in compat.iteritems(sr_115): + self.assertTrue(k in sr_117) + self.assertTrue(v == sr_117[k]) + self.assertTrue(k in keys) + self.assertTrue(v in labels) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)