diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 27b16cb706e8d..e621ab2a5b9c5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -546,6 +546,7 @@ I/O - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) +- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) Period ^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 4e7bd160a5a52..9c6cd2faeaa2f 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1600,14 +1600,13 @@ def _read_strls(self) -> None: v_o = self._read_uint64() else: buf = self._path_or_buf.read(12) - # Only tested on little endian file on little endian machine. + # Only tested on little endian machine. v_size = 2 if self._format_version == 118 else 3 if self._byteorder == "<": buf = buf[0:v_size] + buf[4 : (12 - v_size)] else: - # This path may not be correct, impossible to test - buf = buf[0:v_size] + buf[(4 + v_size) :] - v_o = struct.unpack("Q", buf)[0] + buf = buf[4 - v_size : 4] + buf[(4 + v_size) :] + v_o = struct.unpack(f"{self._byteorder}Q", buf)[0] typ = self._read_uint8() length = self._read_uint32() va = self._path_or_buf.read(length) diff --git a/pandas/tests/io/data/stata/stata12_118.dta b/pandas/tests/io/data/stata/stata12_118.dta new file mode 100644 index 0000000000000..87c6d1f063150 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_118.dta differ diff --git a/pandas/tests/io/data/stata/stata12_119.dta b/pandas/tests/io/data/stata/stata12_119.dta new file mode 100644 index 0000000000000..fa63f0135738e Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_119.dta differ diff --git a/pandas/tests/io/data/stata/stata12_be_117.dta b/pandas/tests/io/data/stata/stata12_be_117.dta new file mode 100644 index 0000000000000..7f84d15fb76d0 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_be_117.dta differ diff --git a/pandas/tests/io/data/stata/stata12_be_118.dta b/pandas/tests/io/data/stata/stata12_be_118.dta new file mode 100644 index 0000000000000..9ed6f39b0f9b5 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_be_118.dta differ diff --git a/pandas/tests/io/data/stata/stata12_be_119.dta b/pandas/tests/io/data/stata/stata12_be_119.dta new file mode 100644 index 0000000000000..3c9736d0f3af3 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_be_119.dta differ diff --git a/pandas/tests/io/data/stata/stata14_119.dta b/pandas/tests/io/data/stata/stata14_119.dta new file mode 100644 index 0000000000000..e64353213b1c9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata14_119.dta differ diff --git a/pandas/tests/io/data/stata/stata14_be_118.dta b/pandas/tests/io/data/stata/stata14_be_118.dta new file mode 100644 index 0000000000000..584ec0984c49e Binary files /dev/null and b/pandas/tests/io/data/stata/stata14_be_118.dta differ diff --git a/pandas/tests/io/data/stata/stata14_be_119.dta b/pandas/tests/io/data/stata/stata14_be_119.dta new file mode 100644 index 0000000000000..09d08f7e992ea Binary files /dev/null and b/pandas/tests/io/data/stata/stata14_be_119.dta differ diff --git a/pandas/tests/io/data/stata/stata16_119.dta b/pandas/tests/io/data/stata/stata16_119.dta new file mode 100644 index 0000000000000..d03c489d4342d Binary files /dev/null and b/pandas/tests/io/data/stata/stata16_119.dta differ diff --git a/pandas/tests/io/data/stata/stata16_be_118.dta b/pandas/tests/io/data/stata/stata16_be_118.dta new file mode 100644 index 0000000000000..bae769c038820 Binary files /dev/null and b/pandas/tests/io/data/stata/stata16_be_118.dta differ diff --git a/pandas/tests/io/data/stata/stata16_be_119.dta b/pandas/tests/io/data/stata/stata16_be_119.dta new file mode 100644 index 0000000000000..e928a9713715d Binary files /dev/null and b/pandas/tests/io/data/stata/stata16_be_119.dta differ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index a27448a342a19..bf17e62985fe9 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -314,8 +314,19 @@ def test_readold_dta4(self, version, datapath): tm.assert_frame_equal(parsed, expected) # File containing strls - def test_read_dta12(self, datapath): - parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta")) + @pytest.mark.parametrize( + "file", + [ + "stata12_117", + "stata12_be_117", + "stata12_118", + "stata12_be_118", + "stata12_119", + "stata12_be_119", + ], + ) + def test_read_dta_strl(self, file, datapath): + parsed = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) expected = DataFrame.from_records( [ [1, "abc", "abcdefghi"], @@ -325,10 +336,20 @@ def test_read_dta12(self, datapath): columns=["x", "y", "z"], ) - tm.assert_frame_equal(parsed_117, expected, check_dtype=False) + tm.assert_frame_equal(parsed, expected, check_dtype=False) - def test_read_dta18(self, datapath): - parsed_118 = self.read_dta(datapath("io", "data", "stata", "stata14_118.dta")) + # 117 is not included in this list as it uses ASCII strings + @pytest.mark.parametrize( + "file", + [ + "stata14_118", + "stata14_be_118", + "stata14_119", + "stata14_be_119", + ], + ) + def test_read_dta118_119(self, file, datapath): + parsed_118 = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) parsed_118["Bytes"] = parsed_118["Bytes"].astype("O") expected = DataFrame.from_records( [ @@ -352,7 +373,7 @@ def test_read_dta18(self, datapath): for col in parsed_118.columns: tm.assert_almost_equal(parsed_118[col], expected[col]) - with StataReader(datapath("io", "data", "stata", "stata14_118.dta")) as rdr: + with StataReader(datapath("io", "data", "stata", f"{file}.dta")) as rdr: vl = rdr.variable_labels() vl_expected = { "Unicode_Cities_Strl": "Here are some strls with Ünicode chars", @@ -1799,8 +1820,18 @@ def test_gzip_writing(self, temp_file): reread = read_stata(gz, index_col="index") tm.assert_frame_equal(df, reread) - def test_unicode_dta_118(self, datapath): - unicode_df = self.read_dta(datapath("io", "data", "stata", "stata16_118.dta")) + # 117 is not included in this list as it uses ASCII strings + @pytest.mark.parametrize( + "file", + [ + "stata16_118", + "stata16_be_118", + "stata16_119", + "stata16_be_119", + ], + ) + def test_unicode_dta_118_119(self, file, datapath): + unicode_df = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) columns = ["utf8", "latin1", "ascii", "utf8_strl", "ascii_strl"] values = [