From 6265450003aa5b5b818d7cd063efcdfb9148cf70 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Tue, 22 Jul 2014 18:23:22 +0100 Subject: [PATCH] BUG: Fixed failure in StataReader when reading variable labels in 117 files Stata's implementation does not match the online dta file format description. The solution used here is to directly compute the offset rather than reading it from the dta file. If Stata fixes their implementation, the original code can be restored. closes #7816 --- doc/source/v0.15.0.txt | 2 +- pandas/io/stata.py | 11 +++++++++-- pandas/io/tests/data/stata7_115.dta | Bin 0 -> 722 bytes pandas/io/tests/data/stata7_117.dta | Bin 0 -> 1159 bytes pandas/io/tests/test_stata.py | 17 ++++++++++++++++- 5 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 pandas/io/tests/data/stata7_115.dta create mode 100644 pandas/io/tests/data/stata7_117.dta diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 06c93541a7783..2322af4752e2e 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -212,7 +212,7 @@ Bug Fixes - Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`) - +- Bug in ``StataReader`` which did not read variable labels in 117 files due to difference between Stata documentation and implementation (:issue:`7816`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 48a5f5ee6c994..3458a95ac096d 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -520,8 +520,15 @@ def _read_header(self): self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9 seek_value_label_names = struct.unpack( self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19 - seek_variable_labels = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17 + # Stata 117 data files do not follow the described format. 
This is + # a workaround that uses the previous label, 33 bytes for each + # variable, 20 for the closing tag and 17 for the opening tag + self.path_or_buf.read(8) # variable_labels offset, throw away + seek_variable_labels = seek_value_label_names + (33*self.nvar) + 20 + 17 + # Below is the original, correct code (per Stata dta format doc, + # although this is not followed in actual 117 dtas) + #seek_variable_labels = struct.unpack( + # self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17 self.path_or_buf.read(8) # self.data_location = struct.unpack( self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6 diff --git a/pandas/io/tests/data/stata7_115.dta b/pandas/io/tests/data/stata7_115.dta new file mode 100644 index 0000000000000000000000000000000000000000..133713b201ba8d5f3c8ff716bfa4e61638979da5 GIT binary patch literal 722 zcmXSBVq{=uU}S)RQ(-XR+Sng9taQ!|FYjQ`3KiwqeI z3=DCa1XN&zQxYTrRA3A=!2pjUaVC(Ys->PmIv!ORQmU543?ynG(FTTiSyl{dsVk6^ zn3S3W3|?5&!A!q+Ef6JYS7bYnqd9}F;2j|uNRbGx1PjAMpk9I2lNNnx2v@fk zKywuutW1D57&KXl5EPpX65bLCLpc*b_fmnW(g9nKqRQnksYF4zhuhKq0npqF)K`)6 z0mEo^y*b=4@o^nH{BZc`aI}*7{l2-HNp*+U4(}X3JA8Er)MSMbmp$G}M690%Lb0E5 ziF}WNPMHnuk>jx#3Ly!C(5Ib2Bt$dDq5%mtj1UxU6SB>oWdqfhI_W8m?b)&i2Kr}$ z`@T%MR;*gz4IU;!%u%X!MHc*E!3;osaoD