pandas-dev · jreback · Jul 23, 2014 · Jul 22, 2014
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -212,7 +212,7 @@ Bug Fixes
 
 
 - Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)
-
+- Bug in ``StataReader`` which did not read variable labels in 117 files due to a difference between the Stata documentation and its implementation (:issue:`7816`)
 
 
 

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -520,8 +520,15 @@ def _read_header(self):
                 self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9
             seek_value_label_names = struct.unpack(
                 self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19
-            seek_variable_labels = struct.unpack(
-                self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17
+            # Stata 117 data files do not follow the described format.  This
+            # workaround starts at seek_value_label_names and adds 33 bytes per
+            # variable, 20 for the closing tag and 17 for the opening tag
+            self.path_or_buf.read(8)  # <variable_labels>, throw away
+            seek_variable_labels = seek_value_label_names + (33*self.nvar) + 20 + 17
+            # Below is the original code, which matches the Stata dta format
+            # doc (although actual 117 dtas do not follow that doc)
+            #seek_variable_labels = struct.unpack(
+            #    self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17
             self.path_or_buf.read(8)  # <characteristics>
             self.data_location = struct.unpack(
                 self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6

diff --git a/pandas/io/tests/data/stata7_115.dta b/pandas/io/tests/data/stata7_115.dta
diff --git a/pandas/io/tests/data/stata7_117.dta b/pandas/io/tests/data/stata7_117.dta
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -68,6 +68,9 @@ def setUp(self):
         self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')
         self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta')
 
+        self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta')
+        self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta')
+
     def read_dta(self, file):
         return read_stata(file, convert_dates=True)
 
@@ -199,7 +202,7 @@ def test_read_dta4(self):
                      'labeled_with_missings', 'float_labelled'])
 
         # these are all categoricals
-        expected = pd.concat([ Series(pd.Categorical(value)) for col, value in expected.iteritems() ],axis=1)
+        expected = pd.concat([ Series(pd.Categorical(value)) for col, value in compat.iteritems(expected)],axis=1)
 
         tm.assert_frame_equal(parsed_113, expected)
         tm.assert_frame_equal(parsed_114, expected)
@@ -551,6 +554,18 @@ def test_bool_uint(self):
             written_and_read_again = written_and_read_again.set_index('index')
             tm.assert_frame_equal(written_and_read_again, expected)
 
+    def test_variable_labels(self):
+        sr_115 = StataReader(self.dta16_115).variable_labels()
+        sr_117 = StataReader(self.dta16_117).variable_labels()
+        keys = ('var1', 'var2', 'var3')
+        labels = ('label1', 'label2', 'label3')
+        for k,v in compat.iteritems(sr_115):
+            self.assertTrue(k in sr_117)
+            self.assertTrue(v == sr_117[k])
+            self.assertTrue(k in keys)
+            self.assertTrue(v in labels)
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -212,7 +212,7 @@ Bug Fixes


		- Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)

		- Bug in ``StataReader`` which did not read variable labels in 117 files due to a difference between the Stata documentation and its implementation (:issue:`7816`)



Expand Down