From b4de86728a239dd07712cb51cee9e38ab4b831c2 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Fri, 3 May 2024 19:44:50 +0100 Subject: [PATCH] ENH: Restore support for reading Stata 104 format dta files, and add support for 103 --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/stata.py | 7 ++-- .../tests/io/data/stata/stata-compat-103.dta | Bin 0 -> 650 bytes .../tests/io/data/stata/stata-compat-104.dta | Bin 0 -> 647 bytes .../io/data/stata/stata-compat-be-103.dta | Bin 0 -> 650 bytes .../io/data/stata/stata-compat-be-104.dta | Bin 0 -> 647 bytes pandas/tests/io/data/stata/stata4_103.dta | Bin 0 -> 780 bytes pandas/tests/io/data/stata/stata4_104.dta | Bin 0 -> 770 bytes pandas/tests/io/test_stata.py | 37 ++++++++++++++---- 9 files changed, 34 insertions(+), 11 deletions(-) create mode 100644 pandas/tests/io/data/stata/stata-compat-103.dta create mode 100644 pandas/tests/io/data/stata/stata-compat-104.dta create mode 100644 pandas/tests/io/data/stata/stata-compat-be-103.dta create mode 100644 pandas/tests/io/data/stata/stata-compat-be-104.dta create mode 100644 pandas/tests/io/data/stata/stata4_103.dta create mode 100644 pandas/tests/io/data/stata/stata4_104.dta diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 278971ef88a0f..db545830ec9ef 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -44,6 +44,7 @@ Other enhancements - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) +- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 47d879c022ee6..41e50043610f2 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -91,7 +91,7 @@ _version_error = ( "Version of given Stata file is {version}. pandas supports importing " - "versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " + "versions 103, 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," "and 119 (Stata 15/16, over 32,767 variables)." ) @@ -1393,7 +1393,7 @@ def _get_seek_variable_labels(self) -> int: def _read_old_header(self, first_char: bytes) -> None: self._format_version = int(first_char[0]) - if self._format_version not in [104, 105, 108, 111, 113, 114, 115]: + if self._format_version not in [103, 104, 105, 108, 111, 113, 114, 115]: raise ValueError(_version_error.format(version=self._format_version)) self._set_encoding() self._byteorder = ">" if self._read_int8() == 0x1 else "<" @@ -1405,7 +1405,8 @@ def _read_old_header(self, first_char: bytes) -> None: self._data_label = self._get_data_label() - self._time_stamp = self._get_time_stamp() + if self._format_version >= 105: + self._time_stamp = self._get_time_stamp() # descriptors if self._format_version > 108: diff --git a/pandas/tests/io/data/stata/stata-compat-103.dta b/pandas/tests/io/data/stata/stata-compat-103.dta new file mode 100644 index 0000000000000000000000000000000000000000..adfeb6c672333b3223c6b4afebda5598baea7b5b GIT binary patch literal 650 zcmYdiVr1Z8U}hi;axyb>(o#}7GxJhXD?rLKEufk*4b32|Ok*PmBMmCUkOF6vKv~6x z1~4&nTGh}<&mf&a)dEDqDX5?&M9|OxtOQOKqZ=~HCp!cffja;H|Nr~{|N8%D&z#vi zYbFTnvKo$T1 literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata-compat-be-103.dta b/pandas/tests/io/data/stata/stata-compat-be-103.dta new file mode 100644 index 0000000000000000000000000000000000000000..0e2ef231f91c00f79e5e8c274e139f325fe4e6f2 GIT binary patch literal 650 zcmYdiWMp9AU|?Wi24cJ*Co?lAEhVKhGcP5z0;Dq20;)dK&;M1Xd*;kp z`<*jEV3xy~POy=QNy#axY3Ui6SwJom)RuidK!qpa3L8MeC<>VwSr{1_7#Q~fO|rji N3@8AlsTzq>L>mP?4i>lpG~Df&Et|gNqCu zp3%2Y|0`Q6`aK%bjQ)O1ZcUyy*4k`C)H*9N?q$0}YHR8`maNZ$c)vs!_siRYXz>uu z-3qpwgygf`SkV@TwRhq`yIV>Vw7(}jM_c`x?6b;b8>^S?%+&c%>gsCOlizB7$u_!9 z4knbo2~n$#3R1^d8Z0Sye6h&jL&2peMRxs{myW`N-)OAc*p9QggkuL6+w;N#b5Y^p^ zbvMYA&o;NhEe3J##9;1jDowcjJ)t?;>epnSRVK@=UbZt+=R>Kht5r{atEtI0x=s!z zl)ed(tBwqk=ds3ZVqGVVN4_NIcHX!k6rF+|kfa1)Ij;M_JS6}N)D1z(z@pRuI5U9d zsD{9a0bd`$ry8{&?n#Zggh1T0nlJ%@xJajDDF*JHr?lAPnzzetLhj`!)4}>Qet;oH z7-NDdy#ntv)dJz*qb@7Wk-|3>2MV*7g4(=EzSeu~$kJw02GY(%f*?`8&t-+}yz4W5 NdS>#HWZVC8#&0|9qDTM$ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index d7fb3c0049965..c7a6103fa7026 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -225,11 +225,9 @@ def test_read_dta3(self, file, datapath): tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize( - "file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"] - ) - def test_read_dta4(self, file, datapath): - file = datapath("io", "data", "stata", f"{file}.dta") + @pytest.mark.parametrize("version", [111, 113, 114, 115, 117]) + def test_read_dta4(self, version, datapath): + file = datapath("io", "data", "stata", f"stata4_{version}.dta") parsed = self.read_dta(file) expected = DataFrame.from_records( @@ -271,11 +269,11 @@ def test_read_dta4(self, file, datapath): # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize("file", ["stata4_105", "stata4_108"]) - def test_readold_dta4(self, file, datapath): + @pytest.mark.parametrize("version", [103, 104, 105, 108]) + def test_readold_dta4(self, version, datapath): # This test is the same as test_read_dta4 above except that the columns # had to be renamed to match the restrictions in older file format - file = datapath("io", "data", "stata", f"{file}.dta") + file = datapath("io", "data", "stata", f"stata4_{version}.dta") parsed = self.read_dta(file) expected = DataFrame.from_records( @@ -2012,6 +2010,18 @@ def test_backward_compat(version, datapath): tm.assert_frame_equal(old_dta, expected, check_dtype=False) +@pytest.mark.parametrize("version", [103, 104]) +def test_backward_compat_nodateconversion(version, datapath): + # The Stata data format prior to 105 did not support a date format + # so read the raw values for comparison + data_base = datapath("io", "data", "stata") + ref = os.path.join(data_base, "stata-compat-118.dta") + old = os.path.join(data_base, f"stata-compat-{version}.dta") + expected = read_stata(ref, convert_dates=False) + old_dta = read_stata(old, convert_dates=False) + tm.assert_frame_equal(old_dta, expected, check_dtype=False) + + @pytest.mark.parametrize("version", [105, 108, 111, 113, 114, 118]) def test_bigendian(version, datapath): ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") @@ -2021,6 +2031,17 @@ def test_bigendian(version, datapath): tm.assert_frame_equal(big_dta, expected) +@pytest.mark.parametrize("version", [103, 104]) +def test_bigendian_nodateconversion(version, datapath): + # The Stata data format prior to 105 did not support a date format + # so read the raw values for comparison + ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") + big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta") + expected = read_stata(ref, convert_dates=False) + big_dta = read_stata(big, convert_dates=False) + tm.assert_frame_equal(big_dta, expected) + + def test_direct_read(datapath, monkeypatch): file_path = datapath("io", "data", "stata", "stata-compat-118.dta")