From b4de86728a239dd07712cb51cee9e38ab4b831c2 Mon Sep 17 00:00:00 2001
From: Chris Charlton <c.charlton@bristol.ac.uk>
Date: Fri, 3 May 2024 19:44:50 +0100
Subject: [PATCH] ENH: Restore support for reading Stata 104 format dta files,
 and add support for 103

---
 doc/source/whatsnew/v3.0.0.rst                |   1 +
 pandas/io/stata.py                            |   7 ++--
 .../tests/io/data/stata/stata-compat-103.dta  | Bin 0 -> 650 bytes
 .../tests/io/data/stata/stata-compat-104.dta  | Bin 0 -> 647 bytes
 .../io/data/stata/stata-compat-be-103.dta     | Bin 0 -> 650 bytes
 .../io/data/stata/stata-compat-be-104.dta     | Bin 0 -> 647 bytes
 pandas/tests/io/data/stata/stata4_103.dta     | Bin 0 -> 780 bytes
 pandas/tests/io/data/stata/stata4_104.dta     | Bin 0 -> 770 bytes
 pandas/tests/io/test_stata.py                 |  37 ++++++++++++++----
 9 files changed, 34 insertions(+), 11 deletions(-)
 create mode 100644 pandas/tests/io/data/stata/stata-compat-103.dta
 create mode 100644 pandas/tests/io/data/stata/stata-compat-104.dta
 create mode 100644 pandas/tests/io/data/stata/stata-compat-be-103.dta
 create mode 100644 pandas/tests/io/data/stata/stata-compat-be-104.dta
 create mode 100644 pandas/tests/io/data/stata/stata4_103.dta
 create mode 100644 pandas/tests/io/data/stata/stata4_104.dta

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 278971ef88a0f..db545830ec9ef 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -44,6 +44,7 @@ Other enhancements
 - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
+- Restore support for reading Stata 104-format dta files, and add support for reading 103-format dta files (:issue:`58554`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.notable_bug_fixes:
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 47d879c022ee6..41e50043610f2 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -91,7 +91,7 @@
 
 _version_error = (
     "Version of given Stata file is {version}. pandas supports importing "
-    "versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
+    "versions 103, 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
     "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
     "and 119 (Stata 15/16, over 32,767 variables)."
 )
@@ -1393,7 +1393,7 @@ def _get_seek_variable_labels(self) -> int:
 
     def _read_old_header(self, first_char: bytes) -> None:
         self._format_version = int(first_char[0])
-        if self._format_version not in [104, 105, 108, 111, 113, 114, 115]:
+        if self._format_version not in [103, 104, 105, 108, 111, 113, 114, 115]:
             raise ValueError(_version_error.format(version=self._format_version))
         self._set_encoding()
         self._byteorder = ">" if self._read_int8() == 0x1 else "<"
@@ -1405,7 +1405,8 @@ def _read_old_header(self, first_char: bytes) -> None:
 
         self._data_label = self._get_data_label()
 
-        self._time_stamp = self._get_time_stamp()
+        if self._format_version >= 105:
+            self._time_stamp = self._get_time_stamp()
 
         # descriptors
         if self._format_version > 108:
diff --git a/pandas/tests/io/data/stata/stata-compat-103.dta b/pandas/tests/io/data/stata/stata-compat-103.dta
new file mode 100644
index 0000000000000000000000000000000000000000..adfeb6c672333b3223c6b4afebda5598baea7b5b
GIT binary patch
literal 650
zcmYdiVr1Z8U}hi;axyb>(o#}7GxJhXD?rLKEufk*4b32|Ok*PmBMmCUkOF6vKv~6x
z1~4&nTGh}<&mf&a)dEDqDX5?&M9|OxtOQOKqZ=~HCp!cffja;H|Nr~{|N8%D&z#vi
zYbFTnv<Dm5dB!0zDLExIEj=SMiwVh&vuAwvK~$aut873~2})Uv42&#{3=IrGRr`#=
L;4;L_E@U$S@uNW-

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata-compat-104.dta b/pandas/tests/io/data/stata/stata-compat-104.dta
new file mode 100644
index 0000000000000000000000000000000000000000..9bc3659afd31c0d7debc436450db5b1d1213fa38
GIT binary patch
literal 647
zcmc~`Vr1Z8U}hi;a*{H0(o#}7GxJhXD?rLKEufk*4b32|Ok*PmBMmCUkOF6vKv~6x
z1~4&nTGh}<&mf&a)dEDqDX5?&M9|OxtOQOKqZ=~HCp!cffja;H|NZ}e{r|IP&g`8v
z69jhJgU#za<B*t?oRXTBo{^cwgs|f58J~R+g(txZ8&DL2(iI~k3nN1V15nXEV=%Z3
JF|iBTL;(1>Ko$T1

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata-compat-be-103.dta b/pandas/tests/io/data/stata/stata-compat-be-103.dta
new file mode 100644
index 0000000000000000000000000000000000000000..0e2ef231f91c00f79e5e8c274e139f325fe4e6f2
GIT binary patch
literal 650
zcmYdiWMp9AU|?Wi24cJ*Co?lAEhVKhGcP5z0;Dq20;)dK&<w)LG&X`T(x5U7DR4#!
zlvQkK024!}RSk{w4AL1?EkGokf(lwf1Pu+qO5jv6x*?-{N<x6~|Nr0r|JVQjzxT|U
zv-Ue@g1{_?Go4^F6O)otQq$5iGP8hOCYUw*e1IxX!c{ha#8Ff-Gcd9+GBhwS?gN@-
Qf7uue9L{us4M3;_00kmJ8UO$Q

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata-compat-be-104.dta b/pandas/tests/io/data/stata/stata-compat-be-104.dta
new file mode 100644
index 0000000000000000000000000000000000000000..98185d8ce27dcc1da08092f9c5b00a348f1d7ba4
GIT binary patch
literal 647
zcmc~`WMp9AU|?Wi24cJ*Cn+;0EhVKhGcP5z0;Dq20;)dK&<w)LG&X`T(x5U7DR4#!
zlvQkK024!}RSk{w4AL1?EkGokf(lwf1Pu+qO5jv6x*?-{N<x6~|L_0*>;M1Xd*;kp
z`<*jEV3xy~POy=QNy#axY3Ui6SwJom)RuidK!qpa3L8MeC<>VwSr{1_7#Q~fO|rji
N3<eHoy1?2I3IPl|KotN0

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata4_103.dta b/pandas/tests/io/data/stata/stata4_103.dta
new file mode 100644
index 0000000000000000000000000000000000000000..3c63935e63df9de2919954b538a3786f50b22edc
GIT binary patch
literal 780
zcmchVJ5s|i5QaA)?<z-dlQsjXpo-jZ0S>@8AlsTzq>L>mP?4i>lpG~Df&Et|gNqCu
zp3%2Y|0`Q6`aK%bjQ)O1ZcUyy*4k`C)H*9N?q$0}YHR8`maNZ$c)vs!_siRYXz>uu
z-3qpwgygf`SkV@TwRhq`yIV>Vw7(}jM_c`x?6b;b8>^S?%+&c%>gsCOlizB7$u_!9
z4knbo2~n$#3R1^d<BHhVDbpi961AN-E{H^@p~w43QV{F7?lb2ph(+oKOhGKA2F$Y%
z-cb#iC*juz(We@59Etd(#?l2MKC20PN{E=!DT5S<c%I^fk82Qd!tv#I^A7vGivbO3
zL}QxJl<$LgntO@D!ADzG8kyWT6%x7GOG9nmq+aX2cI;_Wl!2`?Q6NZ`Sp9OC?Y!$V
Pe%_hMosx6^mpy(1DgUBQ

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata4_104.dta b/pandas/tests/io/data/stata/stata4_104.dta
new file mode 100644
index 0000000000000000000000000000000000000000..c2517355ebff1258584b58f2a2e34b5453c608a9
GIT binary patch
literal 770
zcmcgqOHRWu5PdEEuX==Rx~-rqSk>8Z0S<r?a2iXlT_sKnD|!@;(xdbyFf$GnSyb$Z
zG;jWI?2OBP4@1oG_m>ye6h&jL&2peMRxs{myW`N-)OAc*p9QggkuL6+w;N#b5Y^p^
zbvMYA&o;NhEe3J##9;1jDowcjJ)t?;>epnSRVK@=UbZt+=R>Kht5r{atEtI0x=s!z
zl)ed(tBwqk=ds3ZVqGVVN4_NIcHX!k6rF+|kfa1)Ij;M_JS6}N)D1z(z@pRuI5U9d
zsD{9a0bd`$ry8{&?n#Zggh1T0nlJ%@xJajDDF*JHr?lAPnzzetLhj`!)4}>Qet;oH
z7-NDdy#ntv)dJz*qb@7Wk-|3>2MV*7g4(=EzSeu~$kJw02GY(%f*?`8&t-+}yz4W5
NdS>#HWZVC8#&0|9qDTM$

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index d7fb3c0049965..c7a6103fa7026 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -225,11 +225,9 @@ def test_read_dta3(self, file, datapath):
 
         tm.assert_frame_equal(parsed, expected)
 
-    @pytest.mark.parametrize(
-        "file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"]
-    )
-    def test_read_dta4(self, file, datapath):
-        file = datapath("io", "data", "stata", f"{file}.dta")
+    @pytest.mark.parametrize("version", [111, 113, 114, 115, 117])
+    def test_read_dta4(self, version, datapath):
+        file = datapath("io", "data", "stata", f"stata4_{version}.dta")
         parsed = self.read_dta(file)
 
         expected = DataFrame.from_records(
@@ -271,11 +269,11 @@ def test_read_dta4(self, file, datapath):
         # stata doesn't save .category metadata
         tm.assert_frame_equal(parsed, expected)
 
-    @pytest.mark.parametrize("file", ["stata4_105", "stata4_108"])
-    def test_readold_dta4(self, file, datapath):
+    @pytest.mark.parametrize("version", [103, 104, 105, 108])
+    def test_readold_dta4(self, version, datapath):
         # This test is the same as test_read_dta4 above except that the columns
         # had to be renamed to match the restrictions in older file format
-        file = datapath("io", "data", "stata", f"{file}.dta")
+        file = datapath("io", "data", "stata", f"stata4_{version}.dta")
         parsed = self.read_dta(file)
 
         expected = DataFrame.from_records(
@@ -2012,6 +2010,18 @@ def test_backward_compat(version, datapath):
     tm.assert_frame_equal(old_dta, expected, check_dtype=False)
 
 
+@pytest.mark.parametrize("version", [103, 104])
+def test_backward_compat_nodateconversion(version, datapath):
+    # GH 58554: the Stata data format prior to 105 did not support a date format,
+    # so read both files with convert_dates=False and compare the raw values
+    data_base = datapath("io", "data", "stata")
+    ref = os.path.join(data_base, "stata-compat-118.dta")
+    old = os.path.join(data_base, f"stata-compat-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    old_dta = read_stata(old, convert_dates=False)
+    tm.assert_frame_equal(old_dta, expected, check_dtype=False)
+
+
 @pytest.mark.parametrize("version", [105, 108, 111, 113, 114, 118])
 def test_bigendian(version, datapath):
     ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
@@ -2021,6 +2031,17 @@ def test_bigendian(version, datapath):
     tm.assert_frame_equal(big_dta, expected)
 
 
+@pytest.mark.parametrize("version", [103, 104])
+def test_bigendian_nodateconversion(version, datapath):
+    # GH 58554: the Stata data format prior to 105 did not support a date format,
+    # so read both byte orders with convert_dates=False and compare raw values
+    ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
+    big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    big_dta = read_stata(big, convert_dates=False)
+    tm.assert_frame_equal(big_dta, expected)
+
+
 def test_direct_read(datapath, monkeypatch):
     file_path = datapath("io", "data", "stata", "stata-compat-118.dta")