diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 19b448a1871c2..0f71b52120a47 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -33,6 +33,7 @@ Other enhancements - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) +- Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`) - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 50ee2d52ee51d..47d879c022ee6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1122,6 +1122,7 @@ def __init__( # State variables for the file self._close_file: Callable[[], None] | None = None self._column_selector_set = False + self._value_label_dict: dict[str, dict[int, str]] = {} self._value_labels_read = False self._dtype: np.dtype | None = None self._lines_read = 0 @@ -1502,17 +1503,8 @@ def _decode(self, s: bytes) -> str: ) return s.decode("latin-1") - def _read_value_labels(self) -> None: - self._ensure_open() - if self._value_labels_read: - # Don't read twice - return - if self._format_version <= 108: - # Value labels are not supported in version 108 and earlier. - self._value_labels_read = True - self._value_label_dict: dict[str, dict[float, str]] = {} - return - + def _read_new_value_labels(self) -> None: + """Reads value labels with variable length strings (108 and later format)""" if self._format_version >= 117: self._path_or_buf.seek(self._seek_value_labels) else: @@ -1520,9 +1512,6 @@ def _read_value_labels(self) -> None: offset = self._nobs * self._dtype.itemsize self._path_or_buf.seek(self._data_location + offset) - self._value_labels_read = True - self._value_label_dict = {} - while True: if self._format_version >= 117: if self._path_or_buf.read(5) == b" @@ -1530,8 +1519,10 @@ def _read_value_labels(self) -> None: slength = self._path_or_buf.read(4) if not slength: - break # end of value label table (format < 117) - if self._format_version <= 117: + break # end of value label table (format < 117), or end-of-file + if self._format_version == 108: + labname = self._decode(self._path_or_buf.read(9)) + elif self._format_version <= 117: labname = self._decode(self._path_or_buf.read(33)) else: labname = self._decode(self._path_or_buf.read(129)) @@ -1555,8 +1546,45 @@ def _read_value_labels(self) -> None: self._value_label_dict[labname][val[i]] = self._decode( txt[off[i] : end] ) + if self._format_version >= 117: self._path_or_buf.read(6) # + + def _read_old_value_labels(self) -> None: + """Reads value labels with fixed-length strings (105 and earlier format)""" + assert self._dtype is not None + offset = self._nobs * self._dtype.itemsize + self._path_or_buf.seek(self._data_location + offset) + + while True: + if not self._path_or_buf.read(2): + # end-of-file may have been reached, if so stop here + break + + # otherwise back up and read again, taking byteorder into account + self._path_or_buf.seek(-2, os.SEEK_CUR) + n = self._read_uint16() + labname = self._decode(self._path_or_buf.read(9)) + self._path_or_buf.read(1) # padding + codes = np.frombuffer( + self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n + ) + self._value_label_dict[labname] = {} + for i in range(n): + self._value_label_dict[labname][codes[i]] = self._decode( + self._path_or_buf.read(8) + ) + + def _read_value_labels(self) -> None: + self._ensure_open() + if self._value_labels_read: + # Don't read twice + return + + if self._format_version >= 108: + self._read_new_value_labels() + else: + self._read_old_value_labels() self._value_labels_read = True def _read_strls(self) -> None: @@ -1729,7 +1757,7 @@ def read( i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt) ) - if convert_categoricals and self._format_version > 108: + if convert_categoricals: data = self._do_convert_categoricals( data, self._value_label_dict, self._lbllist, order_categoricals ) @@ -1845,7 +1873,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra def _do_convert_categoricals( self, data: DataFrame, - value_label_dict: dict[str, dict[float, str]], + value_label_dict: dict[str, dict[int, str]], lbllist: Sequence[str], order_categoricals: bool, ) -> DataFrame: @@ -1983,7 +2011,7 @@ def variable_labels(self) -> dict[str, str]: self._ensure_open() return dict(zip(self._varlist, self._variable_labels)) - def value_labels(self) -> dict[str, dict[float, str]]: + def value_labels(self) -> dict[str, dict[int, str]]: """ Return a nested dict associating each variable name to its value and label. diff --git a/pandas/tests/io/data/stata/stata4_105.dta b/pandas/tests/io/data/stata/stata4_105.dta new file mode 100644 index 0000000000000..f804c315b344b Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_105.dta differ diff --git a/pandas/tests/io/data/stata/stata4_108.dta b/pandas/tests/io/data/stata/stata4_108.dta new file mode 100644 index 0000000000000..e78c24b319e47 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_108.dta differ diff --git a/pandas/tests/io/data/stata/stata4_111.dta b/pandas/tests/io/data/stata/stata4_111.dta new file mode 100644 index 0000000000000..b69034174fcfe Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_111.dta differ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 0cc8018ea6213..a58655d91a417 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -225,7 +225,7 @@ def test_read_dta3(self, file, datapath): tm.assert_frame_equal(parsed, expected) @pytest.mark.parametrize( - "file", ["stata4_113", "stata4_114", "stata4_115", "stata4_117"] + "file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"] ) def test_read_dta4(self, file, datapath): file = datapath("io", "data", "stata", f"{file}.dta") @@ -270,6 +270,52 @@ def test_read_dta4(self, file, datapath): # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected) + @pytest.mark.parametrize("file", ["stata4_105", "stata4_108"]) + def test_readold_dta4(self, file, datapath): + # This test is the same as test_read_dta4 above except that the columns + # had to be renamed to match the restrictions in older file format + file = datapath("io", "data", "stata", f"{file}.dta") + parsed = self.read_dta(file) + + expected = DataFrame.from_records( + [ + ["one", "ten", "one", "one", "one"], + ["two", "nine", "two", "two", "two"], + ["three", "eight", "three", "three", "three"], + ["four", "seven", 4, "four", "four"], + ["five", "six", 5, np.nan, "five"], + ["six", "five", 6, np.nan, "six"], + ["seven", "four", 7, np.nan, "seven"], + ["eight", "three", 8, np.nan, "eight"], + ["nine", "two", 9, np.nan, "nine"], + ["ten", "one", "ten", np.nan, "ten"], + ], + columns=[ + "fulllab", + "fulllab2", + "incmplab", + "misslab", + "floatlab", + ], + ) + + # these are all categoricals + for col in expected: + orig = expected[col].copy() + + categories = np.asarray(expected["fulllab"][orig.notna()]) + if col == "incmplab": + categories = orig + + cat = orig.astype("category")._values + cat = cat.set_categories(categories, ordered=True) + cat.categories.rename(None, inplace=True) + + expected[col] = cat + + # stata doesn't save .category metadata + tm.assert_frame_equal(parsed, expected) + # File containing strls def test_read_dta12(self, datapath): parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta"))