From d407ccafaeab7c1688a0bc68e29694a2d9360c0d Mon Sep 17 00:00:00 2001 From: Derek Sharp Date: Fri, 29 Jul 2022 11:26:31 -0400 Subject: [PATCH 1/8] ENH: Move warnings to error/__init__.py per GH27656 --- doc/source/reference/testing.rst | 4 +++ pandas/errors/__init__.py | 54 ++++++++++++++++++++++++++++++++ pandas/io/stata.py | 28 ++++++----------- pandas/tests/test_errors.py | 4 +++ 4 files changed, 71 insertions(+), 19 deletions(-) diff --git a/doc/source/reference/testing.rst b/doc/source/reference/testing.rst index 338dd87aa8c62..1144c767942d4 100644 --- a/doc/source/reference/testing.rst +++ b/doc/source/reference/testing.rst @@ -27,6 +27,7 @@ Exceptions and warnings errors.AbstractMethodError errors.AccessorRegistrationWarning errors.AttributeConflictWarning + errors.CategoricalConversionWarning errors.ClosedFileError errors.CSSWarning errors.DatabaseError @@ -36,6 +37,7 @@ Exceptions and warnings errors.EmptyDataError errors.IncompatibilityWarning errors.IndexingError + errors.InvalidColumnName errors.InvalidIndexError errors.IntCastingNaNError errors.MergeError @@ -49,6 +51,7 @@ Exceptions and warnings errors.ParserWarning errors.PerformanceWarning errors.PossibleDataLossError + errors.PossiblePrecisionLoss errors.PyperclipException errors.PyperclipWindowsException errors.SettingWithCopyError @@ -57,6 +60,7 @@ Exceptions and warnings errors.UndefinedVariableError errors.UnsortedIndexError errors.UnsupportedFunctionCall + errors.ValueLabelTypeMismatch Bug report function ------------------- diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index e3f7e9d454383..f6ad79e7b946e 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -469,6 +469,60 @@ class DatabaseError(OSError): """ +class PossiblePrecisionLoss(Warning): + """ + Warning is raised when calling to_stata on a column where the column value is + outside or equal to the int64 value and is converted to an float64. 
+ + Examples + -------- + >>> import numpy as np + >>> df = pd.DataFrame({"s": pd.Series([1, 2**53], dtype=np.int64)}) + >>> df.to_stata('test') # doctest: +SKIP + ... # PossiblePrecisionLoss: Column converted from int64 to float64... + """ + + +class ValueLabelTypeMismatch(Warning): + """ + Warning is raised when calling to_stata on a category column where the column + contains non-string values. + + Examples + -------- + >>> df = pd.DataFrame({"categories": pd.Series(["a", 2], dtype="category")}) + >>> df.to_stata('test') # doctest: +SKIP + ... # ValueLabelTypeMismatch: Stata value labels (pandas categories) must be str... + """ + + +class InvalidColumnName(Warning): + """ + Warning is raised when calling to_stata on a dataframe that has a column where + the column contains a non-valid stata name and needs to be converted. + + Examples + -------- + >>> df = pd.DataFrame({"0categories": pd.Series([2, 2])}) + >>> df.to_stata('test') # doctest: +SKIP + ... # InvalidColumnName: Not all pandas column names were valid Stata variable... + """ + + +class CategoricalConversionWarning(Warning): + """ + Warning is raised when reading a partially labeled Stata file using an iterator. + + Examples + -------- + >>> from pandas.io.stata import StataReader + >>> with StataReader('dta_file', chunksize=2) as reader: # doctest: +SKIP + ... for i, block in enumerate(reader): + ... print(i, block) + ... # CategoricalConversionWarning: One or more series with value labels... 
+ """ + + __all__ = [ "AbstractMethodError", "AccessorRegistrationWarning", diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 80b6db2500d28..75b8489d35f92 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -41,6 +41,12 @@ StorageOptions, WriteBuffer, ) +from pandas.errors import ( + CategoricalConversionWarning, + InvalidColumnName, + PossiblePrecisionLoss, + ValueLabelTypeMismatch, +) from pandas.util._decorators import ( Appender, doc, @@ -493,32 +499,20 @@ def g(x: datetime.datetime) -> int: """ -class PossiblePrecisionLoss(Warning): - pass - - -precision_loss_doc: Final = """ +precision_loss_doc = """ Column converted from {0} to {1}, and some data are outside of the lossless conversion range. This may result in a loss of precision in the saved data. """ -class ValueLabelTypeMismatch(Warning): - pass - - -value_label_mismatch_doc: Final = """ +value_label_mismatch_doc = """ Stata value labels (pandas categories) must be strings. Column {0} contains non-string labels which will be converted to strings. Please check that the Stata data file created has not lost information due to duplicate labels. """ -class InvalidColumnName(Warning): - pass - - -invalid_name_doc: Final = """ +invalid_name_doc = """ Not all pandas column names were valid Stata variable names. The following replacements have been made: @@ -530,10 +524,6 @@ class InvalidColumnName(Warning): """ -class CategoricalConversionWarning(Warning): - pass - - categorical_conversion_warning = """ One or more series with value labels are not fully labeled. 
Reading this dataset with an iterator results in categorical variable with different diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py index 187d5399f5985..c6ca51b7763d9 100644 --- a/pandas/tests/test_errors.py +++ b/pandas/tests/test_errors.py @@ -35,6 +35,10 @@ "IncompatibilityWarning", "AttributeConflictWarning", "DatabaseError", + "PossiblePrecisionLoss", + "CategoricalConversionWarning", + "InvalidColumnName", + "ValueLabelTypeMismatch", ], ) def test_exception_importable(exc): From be1ba905ec2e7e55478f19b8212cfa76ef18a3dd Mon Sep 17 00:00:00 2001 From: Derek Sharp Date: Fri, 29 Jul 2022 11:51:17 -0400 Subject: [PATCH 2/8] ENH: update whatsnew line --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index d138ebb9c02a3..3a0a749545612 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -272,7 +272,7 @@ Other enhancements - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`) -- :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, and :class:`.IndexingError` are now exposed in ``pandas.errors`` (:issue:`27656`) +- Various errors and warnings are now exposed in ``pandas.errors`` (:issue:`27656`) - Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`) - Allow reading compressed SAS files with :func:`read_sas` (e.g., 
``.sas7bdat.gz`` files) - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) From 9748cc31a4702b2afe62ed6ac017bd18f8668d08 Mon Sep 17 00:00:00 2001 From: Derek Sharp Date: Fri, 29 Jul 2022 16:34:58 -0400 Subject: [PATCH 3/8] ENH: add and re-add final --- pandas/io/stata.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 75b8489d35f92..2df843dfe43e5 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -499,20 +499,20 @@ def g(x: datetime.datetime) -> int: """ -precision_loss_doc = """ +precision_loss_doc: Final = """ Column converted from {0} to {1}, and some data are outside of the lossless conversion range. This may result in a loss of precision in the saved data. """ -value_label_mismatch_doc = """ +value_label_mismatch_doc: Final = """ Stata value labels (pandas categories) must be strings. Column {0} contains non-string labels which will be converted to strings. Please check that the Stata data file created has not lost information due to duplicate labels. """ -invalid_name_doc = """ +invalid_name_doc: Final = """ Not all pandas column names were valid Stata variable names. The following replacements have been made: @@ -524,7 +524,7 @@ def g(x: datetime.datetime) -> int: """ -categorical_conversion_warning = """ +categorical_conversion_warning: Final = """ One or more series with value labels are not fully labeled. Reading this dataset with an iterator results in categorical variable with different categories. 
This occurs since it is not possible to know all possible values From d283a46d626ed2b1b433bc6224fdb77e6300c71e Mon Sep 17 00:00:00 2001 From: Derek Sharp Date: Fri, 29 Jul 2022 16:44:39 -0400 Subject: [PATCH 4/8] ENH: add to __all__ --- pandas/errors/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index f6ad79e7b946e..e7ac1e796fe68 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -527,6 +527,7 @@ class CategoricalConversionWarning(Warning): "AbstractMethodError", "AccessorRegistrationWarning", "AttributeConflictWarning", + "CategoricalConversionWarning", "ClosedFileError", "CSSWarning", "DatabaseError", @@ -536,6 +537,7 @@ class CategoricalConversionWarning(Warning): "EmptyDataError", "IncompatibilityWarning", "IntCastingNaNError", + "InvalidColumnName", "InvalidIndexError", "IndexingError", "MergeError", @@ -549,6 +551,7 @@ class CategoricalConversionWarning(Warning): "ParserWarning", "PerformanceWarning", "PossibleDataLossError", + "PossiblePrecisionLoss", "PyperclipException", "PyperclipWindowsException", "SettingWithCopyError", @@ -557,4 +560,5 @@ class CategoricalConversionWarning(Warning): "UndefinedVariableError", "UnsortedIndexError", "UnsupportedFunctionCall", + "ValueLabelTypeMismatch", ] From 2ca50c8ac48a376cadc3fe334fa6c871f39658d0 Mon Sep 17 00:00:00 2001 From: Derek Sharp Date: Tue, 2 Aug 2022 23:55:10 -0400 Subject: [PATCH 5/8] ENH: apply feedback --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/errors/__init__.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 0d4f93065d8ff..b198a2c9df82c 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -272,7 +272,7 @@ Other enhancements - Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, 
:meth:`DataFrame.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.DataFrameGroupBy.quantile` (:issue:`46560`) - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`, :issue:`46725`) - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`) -- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) +- :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, and :class:`.IndexingError` are now exposed in ``pandas.errors`` (:issue:`27656`) - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`) - Various errors and warnings are now exposed in ``pandas.errors`` (:issue:`27656`) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 947e1eab0f13d..a26a9a05bdf75 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -481,8 +481,10 @@ class DatabaseError(OSError): class PossiblePrecisionLoss(Warning): """ - Warning is raised when calling to_stata on a column where the column value is - outside or equal to the int64 value and is converted to an float64. + Warning raised by to_stata on a column with a value outside or equal to int64. + + When the column value is outside or equal to the int64 value the column is + converted to a float64 dtype. 
Examples -------- @@ -495,8 +497,7 @@ class PossiblePrecisionLoss(Warning): class ValueLabelTypeMismatch(Warning): """ - Warning is raised when calling to_stata on a category column where the column - contains non-string values. + Warning raised by to_stata on a category column that contains non-string values. Examples -------- @@ -508,8 +509,10 @@ class ValueLabelTypeMismatch(Warning): class InvalidColumnName(Warning): """ - Warning is raised when calling to_stata on a dataframe that has a column where - the column contains a non-valid stata name and needs to be converted. + Warning raised by to_stata the column contains a non-valid stata name. + + Because the column name is invalid Stata variable, the name needs to be + converted. Examples -------- From fb211ad11cbda88cc90ff04759e01ddb2f7b4b10 Mon Sep 17 00:00:00 2001 From: Derek Sharp Date: Wed, 3 Aug 2022 00:03:09 -0400 Subject: [PATCH 6/8] ENH: fix doc string --- pandas/errors/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index a26a9a05bdf75..63d8d4a573828 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -488,7 +488,6 @@ class PossiblePrecisionLoss(Warning): Examples -------- - >>> import numpy as np >>> df = pd.DataFrame({"s": pd.Series([1, 2**53], dtype=np.int64)}) >>> df.to_stata('test') # doctest: +SKIP ... # PossiblePrecisionLoss: Column converted from int64 to float64... 
From 680f26938eed9498745894e08bac09f0a6a1dffd Mon Sep 17 00:00:00 2001 From: Derek Sharp Date: Wed, 3 Aug 2022 00:09:42 -0400 Subject: [PATCH 7/8] ENH: add additional exception/warnings to rst --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b198a2c9df82c..b41e6abb16b86 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -272,7 +272,7 @@ Other enhancements - Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.DataFrameGroupBy.quantile` (:issue:`46560`) - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`, :issue:`46725`) - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`) -- :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, and :class:`.IndexingError` are now exposed in ``pandas.errors`` (:issue:`27656`) +- :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, :class:`.IndexingError`, :class:`.PyperclipException`, :class:`.PyperclipWindowsException`, :class:`.CSSWarning`, :class:`.PossibleDataLossError`, :class:`.ClosedFileError`, :class:`.IncompatibilityWarning`, :class:`.AttributeConflictWarning`, :class:`.DatabaseError, :class:`.PossiblePrecisionLoss, :class:`.ValueLabelTypeMismatch, :class:`.InvalidColumnName, and :class:`.CategoricalConversionWarning` are now 
exposed in ``pandas.errors`` (:issue:`27656`) - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`) - Various errors and warnings are now exposed in ``pandas.errors`` (:issue:`27656`) From 9f1ab1404399df081d5a0d12fc82f33d7150d229 Mon Sep 17 00:00:00 2001 From: Derek Sharp Date: Wed, 3 Aug 2022 00:14:02 -0400 Subject: [PATCH 8/8] ENH: fix rst and typo --- doc/source/whatsnew/v1.5.0.rst | 4 ++-- pandas/errors/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b41e6abb16b86..3788cb9867dbb 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -272,10 +272,10 @@ Other enhancements - Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.DataFrameGroupBy.quantile` (:issue:`46560`) - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`, :issue:`46725`) - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`) -- :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, :class:`.IndexingError`, :class:`.PyperclipException`, :class:`.PyperclipWindowsException`, :class:`.CSSWarning`, :class:`.PossibleDataLossError`, :class:`.ClosedFileError`, :class:`.IncompatibilityWarning`, 
:class:`.AttributeConflictWarning`, :class:`.DatabaseError, :class:`.PossiblePrecisionLoss, :class:`.ValueLabelTypeMismatch, :class:`.InvalidColumnName, and :class:`.CategoricalConversionWarning` are now exposed in ``pandas.errors`` (:issue:`27656`) +- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`) -- Various errors and warnings are now exposed in ``pandas.errors`` (:issue:`27656`) +- :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, :class:`.IndexingError`, :class:`.PyperclipException`, :class:`.PyperclipWindowsException`, :class:`.CSSWarning`, :class:`.PossibleDataLossError`, :class:`.ClosedFileError`, :class:`.IncompatibilityWarning`, :class:`.AttributeConflictWarning`, :class:`.DatabaseError`, :class:`.PossiblePrecisionLoss`, :class:`.ValueLabelTypeMismatch`, :class:`.InvalidColumnName`, and :class:`.CategoricalConversionWarning` are now exposed in ``pandas.errors`` (:issue:`27656`) - Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`) - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files) - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 63d8d4a573828..d0c9ef94f4453 100644 --- 
a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -510,7 +510,7 @@ class InvalidColumnName(Warning): """ Warning raised by to_stata the column contains a non-valid stata name. - Because the column name is invalid Stata variable, the name needs to be + Because the column name is an invalid Stata variable, the name needs to be converted. Examples