From 4db9ccb49480426c93f78a5e6fe970d543fdf19c Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Mon, 19 Apr 2021 10:50:05 -0400 Subject: [PATCH 01/17] ENH: option to export df to Stata dataset with value labels GH38454 --- pandas/io/stata.py | 125 ++++++++++++++++++++++++++++++++-- pandas/tests/io/test_stata.py | 36 ++++++++++ 2 files changed, 157 insertions(+), 4 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 1deaa634ce3ae..faef07e9a1d88 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -748,6 +748,71 @@ def generate_value_label(self, byteorder: str) -> bytes: return bio.getvalue() +class StataNonCatValueLabel(StataValueLabel): + """ + Prepare formatted version of value labels + + Parameters + ---------- + labname : str + Value label name + value_labels: Dictionary + Mapping of values to labels + encoding : {"latin-1", "utf-8"} + Encoding to use for value labels. + """ + + def __init__( + self, + labname: str, + value_labels: dict[float | int, str], + encoding: str = "latin-1", + ): + + if encoding not in ("latin-1", "utf-8"): + raise ValueError("Only latin-1 and utf-8 are supported.") + + self.labname = labname + self._encoding = encoding + self.value_labels = [(val, lab) for val, lab in value_labels.items()] + self.value_labels.sort(key=lambda x: x[0]) + + self.text_len = 0 + self.txt: list[bytes] = [] + self.n = 0 + + # Compute lengths and setup lists of offsets and labels + offsets: list[int] = [] + values: list[int] = [] + for vl in self.value_labels: + category = vl[1] + if not isinstance(category, str): + category = str(category) + warnings.warn( + value_label_mismatch_doc.format(labname), + ValueLabelTypeMismatch, + ) + category = category.encode(encoding) + offsets.append(self.text_len) + self.text_len += len(category) + 1 # +1 for the padding + values.append(vl[0]) + self.txt.append(category) + self.n += 1 + + if self.text_len > 32000: + raise ValueError( + "Stata value labels for a single variable must " + "have a 
combined length less than 32,000 characters." + ) + + # Ensure int32 + self.off = np.array(offsets, dtype=np.int32) + self.val = np.array(values, dtype=np.int32) + + # Total length + self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len + + class StataMissingValue: """ An observation's missing value. @@ -2159,6 +2224,10 @@ class StataWriter(StataParser): variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. + value_labels : dict + Dictionary containing columns as keys and dictionaries of column value + to labels as values. Labels for a single variable must be 32,000 + characters or smaller. compression : str or dict, default 'infer' For on-the-fly compression of the output dta. If string, specifies compression mode. If dict, value at key 'method' specifies compression @@ -2223,6 +2292,7 @@ def __init__( time_stamp: datetime.datetime | None = None, data_label: str | None = None, variable_labels: dict[Hashable, str] | None = None, + value_labels: dict[str, dict[float | int, str]] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ): @@ -2232,6 +2302,8 @@ def __init__( self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels + self._non_cat_value_labels = value_labels + self._value_labels: list[StataValueLabel] = [] self._compression = compression self._output_file: Buffer | None = None # attach nobs, nvars, data, varlist, typlist @@ -2259,17 +2331,47 @@ def _write_bytes(self, value: bytes) -> None: """ self.handles.handle.write(value) # type: ignore[arg-type] + def _prepare_non_cat_value_labels(self, data: DataFrame) -> None: + """ + Check for value labels provided for non-categorical columns. 
Value + labels + """ + self._has_value_labels = np.repeat(False, data.shape[1]) + if self._non_cat_value_labels is None: + return + + for labname, labels in self._non_cat_value_labels.items(): + if labname not in data.columns: + # Value label should apply to a column + raise ValueError( + f"Can't create value labels for {labname}, it wasn't " + "found in the dataset." + ) + if is_categorical_dtype(data[labname].dtype): + # Labels should not be passed explicitly for categorical + # columns that will be converted to int + raise ValueError( + f"Can't create value labels for {labname}, a categorical " + "column. Value labels are created automatically before " + "writing categorical columns." + ) + svl = StataNonCatValueLabel(labname, labels) + self._value_labels.append(svl) + + has_non_cat_val_labels = data.columns.isin(self._non_cat_value_labels.keys()) + self._has_value_labels |= has_non_cat_val_labels + def _prepare_categoricals(self, data: DataFrame) -> DataFrame: """ Check for categorical columns, retain categorical information for Stata file and convert categorical data to int """ is_cat = [is_categorical_dtype(data[col].dtype) for col in data] - self._is_col_cat = is_cat - self._value_labels: list[StataValueLabel] = [] if not any(is_cat): return data + self._has_value_labels |= np.array(is_cat) + get_base_missing_value = StataMissingValue.get_base_missing_value data_formatted = [] for col, col_is_cat in zip(data, is_cat): @@ -2449,6 +2551,9 @@ def _prepare_pandas(self, data: DataFrame) -> None: # Replace NaNs with Stata missing values data = self._replace_nans(data) + # Create value labels for non-categorical data + self._prepare_non_cat_value_labels(data) + # Convert categoricals to int data, and strip labels data = self._prepare_categoricals(data) @@ -2688,7 +2793,7 @@ def _write_value_label_names(self) -> None: # lbllist, 33*nvar, char array for i in range(self.nvar): # Use variable name when categorical - if self._is_col_cat[i]: + if 
self._has_value_labels[i]: name = self.varlist[i] name = self._null_terminate_str(name) name = _pad_bytes(name[:32], 33) @@ -3041,6 +3146,10 @@ class StataWriter117(StataWriter): variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. + value_labels : dict + Dictionary containing columns as keys and dictionaries of column value + to labels as values. Labels for a single variable must be 32,000 + characters or smaller. convert_strl : list List of columns names to convert to Stata StrL format. Columns with more than 2045 characters are automatically written as StrL. @@ -3109,6 +3218,7 @@ def __init__( time_stamp: datetime.datetime | None = None, data_label: str | None = None, variable_labels: dict[Hashable, str] | None = None, + value_labels: dict[str, dict[float | int, str]] | None = None, convert_strl: Sequence[Hashable] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, @@ -3127,6 +3237,7 @@ def __init__( time_stamp=time_stamp, data_label=data_label, variable_labels=variable_labels, + value_labels=value_labels, compression=compression, storage_options=storage_options, ) @@ -3272,7 +3383,7 @@ def _write_value_label_names(self) -> None: for i in range(self.nvar): # Use variable name when categorical name = "" # default name - if self._is_col_cat[i]: + if self._has_value_labels[i]: name = self.varlist[i] name = self._null_terminate_str(name) encoded_name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1) @@ -3427,6 +3538,10 @@ class StataWriterUTF8(StataWriter117): variable_labels : dict, default None Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. + value_labels : dict + Dictionary containing columns as keys and dictionaries of column value + to labels as values. Labels for a single variable must be 32,000 + characters or smaller. 
convert_strl : list, default None List of columns names to convert to Stata StrL format. Columns with more than 2045 characters are automatically written as StrL. @@ -3501,6 +3616,7 @@ def __init__( time_stamp: datetime.datetime | None = None, data_label: str | None = None, variable_labels: dict[Hashable, str] | None = None, + value_labels: dict[str, dict[float | int, str]] | None = None, convert_strl: Sequence[Hashable] | None = None, version: int | None = None, compression: CompressionOptions = "infer", @@ -3525,6 +3641,7 @@ def __init__( time_stamp=time_stamp, data_label=data_label, variable_labels=variable_labels, + value_labels=value_labels, convert_strl=convert_strl, compression=compression, storage_options=storage_options, diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 6bf8d23f61937..270ff9ded89b1 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -29,6 +29,7 @@ PossiblePrecisionLoss, StataMissingValue, StataReader, + StataWriter, StataWriterUTF8, ValueLabelTypeMismatch, read_stata, @@ -2048,3 +2049,38 @@ def test_stata_compression(compression_only, read_infer, to_infer): df.to_stata(path, compression=to_compression) result = read_stata(path, compression=read_compression, index_col="index") tm.assert_frame_equal(result, df) + + +def test_non_categorical_value_labels(): + data = DataFrame( + { + "X": [1, 2, 3, 4, 1], + "Y": [7, 7, 9, 8, 10], + "Z": pd.Categorical(["j", "k", "l", "k", "j"]), + } + ) + + with tm.ensure_clean() as path: + value_labels = {"X": {1: "one", 2: "two", 4: "four"}} + expected = {**value_labels, "Z": {0: "j", 1: "k", 2: "l"}} + + writer = StataWriter(path, data, value_labels=value_labels) + writer.write_file() + + reader = StataReader(path) + reader_value_labels = reader.value_labels() + assert reader_value_labels == expected + + msg = "Can't create value labels for notY, it wasn't found in the dataset." 
+ with pytest.raises(ValueError, match=msg): + value_labels = {"notY": {7: "label1", 8: "label2"}} + writer = StataWriter(path, data, value_labels=value_labels) + + msg = ( + "Can't create value labels for Z, a categorical " + "column. Value labels are created automatically before " + "writing categorical columns." + ) + with pytest.raises(ValueError, match=msg): + value_labels = {"Z": {1: "a", 2: "k", 3: "j", 4: "i"}} + writer = StataWriter(path, data, value_labels=value_labels) From 5a3d6d9915ea0a17183f7f939aa9c584baf63c74 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Mon, 19 Apr 2021 12:42:39 -0400 Subject: [PATCH 02/17] Removing unnecessary list comprehension, flake8 --- pandas/io/stata.py | 10 +++++----- pandas/tests/io/test_stata.py | 5 ++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index faef07e9a1d88..83f6d66334377 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -46,6 +46,7 @@ ensure_object, is_categorical_dtype, is_datetime64_dtype, + is_numeric_dtype, ) from pandas import ( @@ -774,7 +775,7 @@ def __init__( self.labname = labname self._encoding = encoding - self.value_labels = [(val, lab) for val, lab in value_labels.items()] + self.value_labels = list(value_labels.items()) self.value_labels.sort(key=lambda x: x[0]) self.text_len = 0 @@ -2347,13 +2348,12 @@ def _prepare_non_cat_value_labels(self, data: DataFrame) -> None: f"Can't create value labels for {labname}, it wasn't " "found in the dataset." ) - if is_categorical_dtype(data[labname].dtype): + if not is_numeric_dtype(data[labname].dtype): # Labels should not be passed explicitly for categorical # columns that will be converted to int raise ValueError( - f"Can't create value labels for {labname}, a categorical " - "column. Value labels are created automatically before " - "writing categorical columns." + f"Can't create value labels for {labname}, value labels " + "can only be applied to numeric columns." 
) svl = StataNonCatValueLabel(labname, labels) self._value_labels.append(svl) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 270ff9ded89b1..22e7677d60764 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2077,9 +2077,8 @@ def test_non_categorical_value_labels(): writer = StataWriter(path, data, value_labels=value_labels) msg = ( - "Can't create value labels for Z, a categorical " - "column. Value labels are created automatically before " - "writing categorical columns." + "Can't create value labels for Z, value labels " + "can only be applied to numeric columns." ) with pytest.raises(ValueError, match=msg): value_labels = {"Z": {1: "a", 2: "k", 3: "j", 4: "i"}} From bbb43f8512acd0c19a42c33368b0158ad8640eb5 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Mon, 19 Apr 2021 14:44:40 -0400 Subject: [PATCH 03/17] Adding value_labels argument to DataFrame to_stata method --- pandas/core/frame.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index db12129a15ef9..79e0602bc73f4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2378,6 +2378,7 @@ def to_stata( time_stamp: datetime.datetime | None = None, data_label: str | None = None, variable_labels: dict[Hashable, str] | None = None, + value_labels: dict[str, dict[float | int, str]] | None = None, version: int | None = 114, convert_strl: Sequence[Hashable] | None = None, compression: CompressionOptions = "infer", @@ -2420,6 +2421,10 @@ def to_stata( variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. + value_labels : dict + Dictionary containing columns as keys and dictionaries of column value + to labels as values. Labels for a single variable must be 32,000 + characters or smaller. version : {{114, 117, 118, 119, None}}, default 114 Version to use in the output dta file. 
Set to None to let pandas decide between 118 or 119 formats depending on the number of @@ -2522,6 +2527,7 @@ def to_stata( data_label=data_label, write_index=write_index, variable_labels=variable_labels, + value_labels=value_labels, compression=compression, storage_options=storage_options, **kwargs, From 533a3d5a94fad499eb8c9c45a0677347dd82b5bf Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Mon, 26 Apr 2021 10:34:48 -0400 Subject: [PATCH 04/17] Updating types and changing ValueError to KeyError for missing column --- pandas/core/frame.py | 4 ++-- pandas/io/stata.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 79e0602bc73f4..21dd893f9484a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2378,7 +2378,7 @@ def to_stata( time_stamp: datetime.datetime | None = None, data_label: str | None = None, variable_labels: dict[Hashable, str] | None = None, - value_labels: dict[str, dict[float | int, str]] | None = None, + value_labels: dict[Hashable, dict[float | int, str]] | None = None, version: int | None = 114, convert_strl: Sequence[Hashable] | None = None, compression: CompressionOptions = "infer", @@ -2421,7 +2421,7 @@ def to_stata( variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - value_labels : dict + value_labels : dict of dicts Dictionary containing columns as keys and dictionaries of column value to labels as values. Labels for a single variable must be 32,000 characters or smaller. 
diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 83f6d66334377..147983cc3330c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -21,6 +21,7 @@ Any, AnyStr, Hashable, + Literal, Sequence, cast, ) @@ -767,7 +768,7 @@ def __init__( self, labname: str, value_labels: dict[float | int, str], - encoding: str = "latin-1", + encoding: Literal["latin-1", "utf-8"] = "latin-1", ): if encoding not in ("latin-1", "utf-8"): @@ -2344,7 +2345,7 @@ def _prepare_non_cat_value_labels(self, data: DataFrame) -> None: for labname, labels in self._non_cat_value_labels.items(): if labname not in data.columns: # Value label should apply to a column - raise ValueError( + raise KeyError( f"Can't create value labels for {labname}, it wasn't " "found in the dataset." ) @@ -3616,7 +3617,7 @@ def __init__( time_stamp: datetime.datetime | None = None, data_label: str | None = None, variable_labels: dict[Hashable, str] | None = None, - value_labels: dict[str, dict[float | int, str]] | None = None, + value_labels: dict[Hashable, dict[float | int, str]] | None = None, convert_strl: Sequence[Hashable] | None = None, version: int | None = None, compression: CompressionOptions = "infer", From ed73d69f205fc5f87a11403a06422569657528e8 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Fri, 30 Apr 2021 14:41:21 -0400 Subject: [PATCH 05/17] Using converted names for invalid Stata variable names --- pandas/io/stata.py | 54 ++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 147983cc3330c..eea7217093a90 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2226,10 +2226,6 @@ class StataWriter(StataParser): variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - value_labels : dict - Dictionary containing columns as keys and dictionaries of column value - to labels as values. 
Labels for a single variable must be 32,000 - characters or smaller. compression : str or dict, default 'infer' For on-the-fly compression of the output dta. If string, specifies compression mode. If dict, value at key 'method' specifies compression @@ -2246,6 +2242,11 @@ class StataWriter(StataParser): .. versionadded:: 1.2.0 + value_labels : dict of dicts + Dictionary containing columns as keys and dictionaries of column value + to labels as values. Labels for a single variable must be 32,000 + characters or smaller. + Returns ------- writer : StataWriter instance @@ -2294,9 +2295,10 @@ def __init__( time_stamp: datetime.datetime | None = None, data_label: str | None = None, variable_labels: dict[Hashable, str] | None = None, - value_labels: dict[str, dict[float | int, str]] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, + *, + value_labels: dict[Hashable, dict[float | int, str]] | None = None, ): super().__init__() self._convert_dates = {} if convert_dates is None else convert_dates @@ -2308,6 +2310,7 @@ def __init__( self._value_labels: list[StataValueLabel] = [] self._compression = compression self._output_file: Buffer | None = None + self._converted_names: dict[Hashable, str] = {} # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) self.storage_options = storage_options @@ -2317,7 +2320,6 @@ def __init__( self._byteorder = _set_endianness(byteorder) self._fname = fname self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} - self._converted_names: dict[Hashable, str] = {} def _write(self, to_write: str) -> None: """ @@ -2339,27 +2341,33 @@ def _prepare_non_cat_value_labels(self, data: DataFrame) -> None: labels """ self._has_value_labels = np.repeat(False, data.shape[1]) + labelled_columns = [] if self._non_cat_value_labels is None: return for labname, labels in self._non_cat_value_labels.items(): - if labname not in data.columns: - # Value label should apply to a column + if 
labname in self._converted_names: + colname = self._converted_names[labname] + elif labname in data.columns: + colname = labname + else: raise KeyError( f"Can't create value labels for {labname}, it wasn't " "found in the dataset." ) - if not is_numeric_dtype(data[labname].dtype): + + if not is_numeric_dtype(data[colname].dtype): # Labels should not be passed explicitly for categorical # columns that will be converted to int raise ValueError( f"Can't create value labels for {labname}, value labels " "can only be applied to numeric columns." ) - svl = StataNonCatValueLabel(labname, labels) + svl = StataNonCatValueLabel(colname, labels) self._value_labels.append(svl) + labelled_columns.append(colname) - has_non_cat_val_labels = data.columns.isin(self._non_cat_value_labels.keys()) + has_non_cat_val_labels = data.columns.isin(labelled_columns) self._has_value_labels |= has_non_cat_val_labels def _prepare_categoricals(self, data: DataFrame) -> DataFrame: @@ -3147,10 +3155,6 @@ class StataWriter117(StataWriter): variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - value_labels : dict - Dictionary containing columns as keys and dictionaries of column value - to labels as values. Labels for a single variable must be 32,000 - characters or smaller. convert_strl : list List of columns names to convert to Stata StrL format. Columns with more than 2045 characters are automatically written as StrL. @@ -3169,6 +3173,11 @@ class StataWriter117(StataWriter): .. versionadded:: 1.1.0 + value_labels : dict of dicts + Dictionary containing columns as keys and dictionaries of column value + to labels as values. Labels for a single variable must be 32,000 + characters or smaller. 
+ Returns ------- writer : StataWriter117 instance @@ -3219,10 +3228,11 @@ def __init__( time_stamp: datetime.datetime | None = None, data_label: str | None = None, variable_labels: dict[Hashable, str] | None = None, - value_labels: dict[str, dict[float | int, str]] | None = None, convert_strl: Sequence[Hashable] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, + *, + value_labels: dict[Hashable, dict[float | int, str]] | None = None, ): # Copy to new list since convert_strl might be modified later self._convert_strl: list[Hashable] = [] @@ -3539,10 +3549,6 @@ class StataWriterUTF8(StataWriter117): variable_labels : dict, default None Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - value_labels : dict - Dictionary containing columns as keys and dictionaries of column value - to labels as values. Labels for a single variable must be 32,000 - characters or smaller. convert_strl : list, default None List of columns names to convert to Stata StrL format. Columns with more than 2045 characters are automatically written as StrL. @@ -3565,6 +3571,11 @@ class StataWriterUTF8(StataWriter117): .. versionadded:: 1.1.0 + value_labels : dict of dicts + Dictionary containing columns as keys and dictionaries of column value + to labels as values. Labels for a single variable must be 32,000 + characters or smaller. 
+ Returns ------- StataWriterUTF8 @@ -3617,11 +3628,12 @@ def __init__( time_stamp: datetime.datetime | None = None, data_label: str | None = None, variable_labels: dict[Hashable, str] | None = None, - value_labels: dict[Hashable, dict[float | int, str]] | None = None, convert_strl: Sequence[Hashable] | None = None, version: int | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, + *, + value_labels: dict[Hashable, dict[float | int, str]] | None = None, ): if version is None: version = 118 if data.shape[1] <= 32767 else 119 From ae4dca7e7bc6ef4e657bc89b96dcc13fd32e4e09 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Fri, 30 Apr 2021 14:42:01 -0400 Subject: [PATCH 06/17] Moving value_labels to key word only for to_stata --- pandas/core/frame.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 21dd893f9484a..d811cb329ea9d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2378,11 +2378,12 @@ def to_stata( time_stamp: datetime.datetime | None = None, data_label: str | None = None, variable_labels: dict[Hashable, str] | None = None, - value_labels: dict[Hashable, dict[float | int, str]] | None = None, version: int | None = 114, convert_strl: Sequence[Hashable] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, + *, + value_labels: dict[Hashable, dict[float | int, str]] | None = None, ) -> None: """ Export DataFrame object to Stata dta format. @@ -2421,10 +2422,6 @@ def to_stata( variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - value_labels : dict of dicts - Dictionary containing columns as keys and dictionaries of column value - to labels as values. Labels for a single variable must be 32,000 - characters or smaller. 
version : {{114, 117, 118, 119, None}}, default 114 Version to use in the output dta file. Set to None to let pandas decide between 118 or 119 formats depending on the number of @@ -2468,6 +2465,11 @@ def to_stata( .. versionadded:: 1.2.0 + value_labels : dict of dicts + Dictionary containing columns as keys and dictionaries of column value + to labels as values. Labels for a single variable must be 32,000 + characters or smaller. + Raises ------ NotImplementedError @@ -2527,9 +2529,9 @@ def to_stata( data_label=data_label, write_index=write_index, variable_labels=variable_labels, - value_labels=value_labels, compression=compression, storage_options=storage_options, + value_labels=value_labels, **kwargs, ) writer.write_file() From 8e57e46043166b210f1180698737acb2f68e2783 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Fri, 30 Apr 2021 18:42:31 -0400 Subject: [PATCH 07/17] Adding tests for invalid Stata names and repeated value labels --- pandas/tests/io/test_stata.py | 85 +++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 22e7677d60764..02cf478c61583 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2054,14 +2054,18 @@ def test_stata_compression(compression_only, read_infer, to_infer): def test_non_categorical_value_labels(): data = DataFrame( { - "X": [1, 2, 3, 4, 1], + "fully_labelled": [1, 2, 3, 3, 1], + "partially_labelled": [1.0, 2.0, np.nan, 9.0, np.nan], "Y": [7, 7, 9, 8, 10], "Z": pd.Categorical(["j", "k", "l", "k", "j"]), } ) with tm.ensure_clean() as path: - value_labels = {"X": {1: "one", 2: "two", 4: "four"}} + value_labels = { + "fully_labelled": {1: "one", 2: "two", 3: "three"}, + "partially_labelled": {1.0: "one", 2.0: "two"}, + } expected = {**value_labels, "Z": {0: "j", 1: "k", 2: "l"}} writer = StataWriter(path, data, value_labels=value_labels) @@ -2072,7 +2076,7 @@ def 
test_non_categorical_value_labels(): assert reader_value_labels == expected msg = "Can't create value labels for notY, it wasn't found in the dataset." - with pytest.raises(ValueError, match=msg): + with pytest.raises(KeyError, match=msg): value_labels = {"notY": {7: "label1", 8: "label2"}} writer = StataWriter(path, data, value_labels=value_labels) @@ -2083,3 +2087,78 @@ def test_non_categorical_value_labels(): with pytest.raises(ValueError, match=msg): value_labels = {"Z": {1: "a", 2: "k", 3: "j", 4: "i"}} writer = StataWriter(path, data, value_labels=value_labels) + + +def test_non_categorical_value_label_name_conversion(): + # Check conversion of invalid variable names + data = DataFrame( + { + "invalid~!": [1, 1, 2, 3, 5, 8], # Only alphanumeric and _ + "6_invalid": [1, 1, 2, 3, 5, 8], # Must start with letter or _ + "invalid_name_longer_than_32_characters": [8, 8, 9, 9, 8, 8], # Too long + "aggregate": [2, 5, 5, 6, 6, 9], # Reserved words + (1, 2): [1, 2, 3, 4, 5, 6], # Hashable non-string + } + ) + + value_labels = { + "invalid~!": {1: "label1", 2: "label2"}, + "6_invalid": {1: "label1", 2: "label2"}, + "invalid_name_longer_than_32_characters": {8: "eight", 9: "nine"}, + "aggregate": {5: "five"}, + (1, 2): {3: "three"}, + } + + expected = { + "invalid__": {1: "label1", 2: "label2"}, + "_6_invalid": {1: "label1", 2: "label2"}, + "invalid_name_longer_than_32_char": {8: "eight", 9: "nine"}, + "_aggregate": {5: "five"}, + "_1__2_": {3: "three"}, + } + + with tm.ensure_clean() as path: + with tm.assert_produces_warning(InvalidColumnName): + data.to_stata(path, value_labels=value_labels) + + reader = StataReader(path) + reader_value_labels = reader.value_labels() + assert reader_value_labels == expected + + +def test_non_categorical_value_label_convert_categoricals_error(): + # Mapping more than one value to the same label is valid for Stata + # labels, but can't be read with convert_categoricals=True + value_labels = { + "repeated_labels": {10: "Ten", 20: "More 
than ten", 40: "More than ten"} + } + + data = DataFrame( + { + "repeated_labels": [10, 10, 20, 20, 40, 40], + } + ) + + with tm.ensure_clean() as path: + data.to_stata(path, value_labels=value_labels) + + reader = StataReader(path, convert_categoricals=False) + reader_value_labels = reader.value_labels() + assert reader_value_labels == value_labels + + col = "repeated_labels" + repeats = "-" * 80 + "\n" + "\n".join(["More than ten"]) + + msg = f""" +Value labels for column {col} are not unique. These cannot be converted to +pandas categoricals. + +Either read the file with `convert_categoricals` set to False or use the +low level interface in `StataReader` to separately read the values and the +value_labels. + +The repeated labels are: +{repeats} +""" + with pytest.raises(ValueError, match=msg): + read_stata(path, convert_categoricals=True) From 70dc88b99584d6a5b19cbf7c9dd67cfb20f54c04 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Fri, 30 Apr 2021 19:03:30 -0400 Subject: [PATCH 08/17] Fixing Literal import --- pandas/io/stata.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index eea7217093a90..d310c6bcffc82 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -18,10 +18,10 @@ import struct import sys from typing import ( + TYPE_CHECKING, Any, AnyStr, Hashable, - Literal, Sequence, cast, ) @@ -66,6 +66,9 @@ from pandas.io.common import get_handle +if TYPE_CHECKING: + from typing import Literal + _version_error = ( "Version of given Stata file is {version}. 
pandas supports importing " "versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " From 2796d1f660023380a5e933aa550124a675872160 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Mon, 3 May 2021 14:52:34 -0400 Subject: [PATCH 09/17] Moving label encoding to method --- pandas/io/stata.py | 46 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d310c6bcffc82..d661efb360815 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -665,10 +665,18 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): categories = catarray.cat.categories self.value_labels = list(zip(np.arange(len(categories)), categories)) self.value_labels.sort(key=lambda x: x[0]) + self.text_len = 0 self.txt: list[bytes] = [] self.n = 0 + self.off = np.array([]) + self.val = np.array([]) + self.len = 0 + + self._prepare_value_labels() + def _prepare_value_labels(self): + """ Encode value labels. """ # Compute lengths and setup lists of offsets and labels offsets: list[int] = [] values: list[int] = [] @@ -677,10 +685,10 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): if not isinstance(category, str): category = str(category) warnings.warn( - value_label_mismatch_doc.format(catarray.name), + value_label_mismatch_doc.format(self.labname), ValueLabelTypeMismatch, ) - category = category.encode(encoding) + category = category.encode(self._encoding) offsets.append(self.text_len) self.text_len += len(category) + 1 # +1 for the padding values.append(vl[0]) @@ -785,37 +793,11 @@ def __init__( self.text_len = 0 self.txt: list[bytes] = [] self.n = 0 + self.off = np.array([]) + self.val = np.array([]) + self.len = 0 - # Compute lengths and setup lists of offsets and labels - offsets: list[int] = [] - values: list[int] = [] - for vl in self.value_labels: - category = vl[1] - if not isinstance(category, str): - category = str(category) - warnings.warn( - 
value_label_mismatch_doc.format(labname), - ValueLabelTypeMismatch, - ) - category = category.encode(encoding) - offsets.append(self.text_len) - self.text_len += len(category) + 1 # +1 for the padding - values.append(vl[0]) - self.txt.append(category) - self.n += 1 - - if self.text_len > 32000: - raise ValueError( - "Stata value labels for a single variable must " - "have a combined length less than 32,000 characters." - ) - - # Ensure int32 - self.off = np.array(offsets, dtype=np.int32) - self.val = np.array(values, dtype=np.int32) - - # Total length - self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len + self._prepare_value_labels() class StataMissingValue: From 277896afbf94d8cdc3252efcce39898a639264c2 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Thu, 6 May 2021 16:38:34 -0400 Subject: [PATCH 10/17] Updates from review: typing, documentation --- pandas/core/frame.py | 2 ++ pandas/io/stata.py | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d811cb329ea9d..85cd908efe0dc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2470,6 +2470,8 @@ def to_stata( to labels as values. Labels for a single variable must be 32,000 characters or smaller. + .. versionadded:: 1.3.0 + Raises ------ NotImplementedError diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d661efb360815..ed00f624e3ef3 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -663,7 +663,9 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): self.labname = catarray.name self._encoding = encoding categories = catarray.cat.categories - self.value_labels = list(zip(np.arange(len(categories)), categories)) + self.value_labels: list[tuple[int | float, str], ...] 
= list( + zip(np.arange(len(categories)), categories) + ) self.value_labels.sort(key=lambda x: x[0]) self.text_len = 0 @@ -787,8 +789,9 @@ def __init__( self.labname = labname self._encoding = encoding - self.value_labels = list(value_labels.items()) - self.value_labels.sort(key=lambda x: x[0]) + self.value_labels: list[tuple[int | float, str], ...] = sorted( + value_labels.items(), key=lambda x: x[0] + ) self.text_len = 0 self.txt: list[bytes] = [] From c31034d24c97d81074100d2e6241394a02428627 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Mon, 10 May 2021 21:40:03 -0400 Subject: [PATCH 11/17] Fixing mypy errors --- pandas/io/stata.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ed00f624e3ef3..4dc564e07be82 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -663,7 +663,7 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): self.labname = catarray.name self._encoding = encoding categories = catarray.cat.categories - self.value_labels: list[tuple[int | float, str], ...] = list( + self.value_labels: list[tuple[int | float, str]] = list( zip(np.arange(len(categories)), categories) ) self.value_labels.sort(key=lambda x: x[0]) @@ -681,9 +681,9 @@ def _prepare_value_labels(self): """ Encode value labels. """ # Compute lengths and setup lists of offsets and labels offsets: list[int] = [] - values: list[int] = [] + values: list[int | float] = [] for vl in self.value_labels: - category = vl[1] + category: str | bytes = vl[1] if not isinstance(category, str): category = str(category) warnings.warn( @@ -789,7 +789,7 @@ def __init__( self.labname = labname self._encoding = encoding - self.value_labels: list[tuple[int | float, str], ...] 
= sorted( + self.value_labels: list[tuple[int | float, str]] = sorted( value_labels.items(), key=lambda x: x[0] ) @@ -2337,7 +2337,7 @@ def _prepare_non_cat_value_labels(self, data: DataFrame) -> None: if labname in self._converted_names: colname = self._converted_names[labname] elif labname in data.columns: - colname = labname + colname = str(labname) else: raise KeyError( f"Can't create value labels for {labname}, it wasn't " From 85374fd0508c510ed1479db15ba5da83fcfdc106 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Thu, 27 May 2021 09:16:38 -0400 Subject: [PATCH 12/17] Clarifying comment on label length --- pandas/io/stata.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 4dc564e07be82..ab5b4ed685657 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2232,8 +2232,8 @@ class StataWriter(StataParser): value_labels : dict of dicts Dictionary containing columns as keys and dictionaries of column value - to labels as values. Labels for a single variable must be 32,000 - characters or smaller. + to labels as values. The combined length of all labels for a single + variable must be 32,000 characters or smaller. Returns ------- @@ -3163,8 +3163,8 @@ class StataWriter117(StataWriter): value_labels : dict of dicts Dictionary containing columns as keys and dictionaries of column value - to labels as values. Labels for a single variable must be 32,000 - characters or smaller. + to labels as values. The combined length of all labels for a single + variable must be 32,000 characters or smaller. Returns ------- @@ -3561,8 +3561,8 @@ class StataWriterUTF8(StataWriter117): value_labels : dict of dicts Dictionary containing columns as keys and dictionaries of column value - to labels as values. Labels for a single variable must be 32,000 - characters or smaller. + to labels as values. The combined length of all labels for a single + variable must be 32,000 characters or smaller. 
Returns ------- From 4ac27dbcd1d4ee77fbd3286b9cc8f6dfab2c1132 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Wed, 16 Jun 2021 09:23:12 -0400 Subject: [PATCH 13/17] Removing duplication in value label class and returning labels from prepare_non_cat --- pandas/io/stata.py | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ab5b4ed685657..f5ad01fa99902 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -668,17 +668,20 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): ) self.value_labels.sort(key=lambda x: x[0]) + self._prepare_value_labels() + + def _prepare_value_labels(self): + """ Encode value labels. """ + self.text_len = 0 self.txt: list[bytes] = [] self.n = 0 + # Offsets (length of categories), converted to int32 self.off = np.array([]) + # Values, converted to int32 self.val = np.array([]) self.len = 0 - self._prepare_value_labels() - - def _prepare_value_labels(self): - """ Encode value labels. 
""" # Compute lengths and setup lists of offsets and labels offsets: list[int] = [] values: list[int | float] = [] @@ -792,14 +795,6 @@ def __init__( self.value_labels: list[tuple[int | float, str]] = sorted( value_labels.items(), key=lambda x: x[0] ) - - self.text_len = 0 - self.txt: list[bytes] = [] - self.n = 0 - self.off = np.array([]) - self.val = np.array([]) - self.len = 0 - self._prepare_value_labels() @@ -2296,6 +2291,7 @@ def __init__( self._variable_labels = variable_labels self._non_cat_value_labels = value_labels self._value_labels: list[StataValueLabel] = [] + self._has_value_labels = np.array([], dtype=bool) self._compression = compression self._output_file: Buffer | None = None self._converted_names: dict[Hashable, str] = {} @@ -2323,15 +2319,16 @@ def _write_bytes(self, value: bytes) -> None: """ self.handles.handle.write(value) # type: ignore[arg-type] - def _prepare_non_cat_value_labels(self, data: DataFrame) -> None: + def _prepare_non_cat_value_labels( + self, data: DataFrame + ) -> list[StataNonCatValueLabel]: """ Check for value labels provided for non-categorical columns. Value labels """ - self._has_value_labels = np.repeat(False, data.shape[1]) - labelled_columns = [] + non_cat_value_labels: list[StataNonCatValueLabel] = [] if self._non_cat_value_labels is None: - return + return non_cat_value_labels for labname, labels in self._non_cat_value_labels.items(): if labname in self._converted_names: @@ -2352,11 +2349,8 @@ def _prepare_non_cat_value_labels(self, data: DataFrame) -> None: "can only be applied to numeric columns." 
) svl = StataNonCatValueLabel(colname, labels) - self._value_labels.append(svl) - labelled_columns.append(colname) - - has_non_cat_val_labels = data.columns.isin(labelled_columns) - self._has_value_labels |= has_non_cat_val_labels + non_cat_value_labels.append(svl) + return non_cat_value_labels def _prepare_categoricals(self, data: DataFrame) -> DataFrame: """ @@ -2548,8 +2542,16 @@ def _prepare_pandas(self, data: DataFrame) -> None: # Replace NaNs with Stata missing values data = self._replace_nans(data) + # Set all columns to initially unlabelled + self._has_value_labels = np.repeat(False, data.shape[1]) + # Create value labels for non-categorical data - self._prepare_non_cat_value_labels(data) + non_cat_value_labels = self._prepare_non_cat_value_labels(data) + + non_cat_columns = [svl.labname for svl in non_cat_value_labels] + has_non_cat_val_labels = data.columns.isin(non_cat_columns) + self._has_value_labels |= has_non_cat_val_labels + self._value_labels.extend(non_cat_value_labels) # Convert categoricals to int data, and strip labels data = self._prepare_categoricals(data) From d2a55840859055505f889dd12341869b783b1554 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Mon, 21 Jun 2021 08:07:49 -0400 Subject: [PATCH 14/17] Adding versionaddeds --- pandas/core/frame.py | 2 +- pandas/io/stata.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 85cd908efe0dc..c1a3f4d9298b4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2470,7 +2470,7 @@ def to_stata( to labels as values. Labels for a single variable must be 32,000 characters or smaller. - .. versionadded:: 1.3.0 + .. versionadded:: 1.4.0 Raises ------ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index f5ad01fa99902..35d826c038e62 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2230,6 +2230,8 @@ class StataWriter(StataParser): to labels as values. 
The combined length of all labels for a single variable must be 32,000 characters or smaller. + .. versionadded:: 1.4.0 + Returns ------- writer : StataWriter instance @@ -3168,6 +3170,8 @@ class StataWriter117(StataWriter): to labels as values. The combined length of all labels for a single variable must be 32,000 characters or smaller. + .. versionadded:: 1.4.0 + Returns ------- writer : StataWriter117 instance @@ -3566,6 +3570,8 @@ class StataWriterUTF8(StataWriter117): to labels as values. The combined length of all labels for a single variable must be 32,000 characters or smaller. + .. versionadded:: 1.4.0 + Returns ------- StataWriterUTF8 From d30d1fe2e5645a3fc4fa92ed6f059759dbce9c31 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Mon, 21 Jun 2021 08:10:06 -0400 Subject: [PATCH 15/17] Typo in spacing of docstring --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 35d826c038e62..125e669178e69 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -671,7 +671,7 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): self._prepare_value_labels() def _prepare_value_labels(self): - """ Encode value labels. """ + """Encode value labels.""" self.text_len = 0 self.txt: list[bytes] = [] From 05d4d74aa2cf09496103eada127689985861dd42 Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Mon, 19 Jul 2021 09:32:40 -0400 Subject: [PATCH 16/17] Adding release note --- doc/source/whatsnew/v1.4.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2f8cb346935a9..d20ca1a4c6e00 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -101,6 +101,8 @@ Other enhancements - :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. 
See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`) - :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`) - :meth:`read_table` now supports the argument ``storage_options`` (:issue:`39167`) +- :meth:`DataFrame.to_stata` and :class:`StataWriter` now accept the keyword-only argument ``value_labels`` to save labels for non-categorical columns +- .. --------------------------------------------------------------------------- From 03111426cf4dc25bec4ea81af99081bc8f7b8eea Mon Sep 17 00:00:00 2001 From: lmcindewar Date: Wed, 28 Jul 2021 14:05:49 -0400 Subject: [PATCH 17/17] Setting data in StataWriter init --- pandas/io/stata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 125e669178e69..11b9e8f7009c4 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2286,6 +2286,7 @@ def __init__( value_labels: dict[Hashable, dict[float | int, str]] | None = None, ): super().__init__() + self.data = data self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index self._time_stamp = time_stamp