From 1b720e96d9bc5548b7f5bf154121d538794ec12c Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sat, 28 May 2022 01:14:08 +0200 Subject: [PATCH 1/4] Allow reading SAS files from archives --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/io/sas/sas7bdat.py | 6 +- pandas/io/sas/sas_xport.py | 8 ++- pandas/io/sas/sasreader.py | 100 +++++++++++++++++++------------- pandas/tests/io/sas/test_sas.py | 10 +++- 5 files changed, 83 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4a0b9a97a9d11..823e483e7adbc 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -152,6 +152,7 @@ Other enhancements - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`) - :class:`DataError` and :class:`SpecificationError` are now exposed in ``pandas.errors`` (:issue:`27656`) +- Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files) .. --------------------------------------------------------------------------- .. 
_whatsnew_150.notable_bug_fixes: diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index debd686475432..c6e1b74c11dd8 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -26,6 +26,7 @@ import numpy as np from pandas._typing import ( + CompressionOptions, FilePath, ReadBuffer, ) @@ -168,6 +169,7 @@ def __init__( encoding=None, convert_text=True, convert_header_text=True, + compression: CompressionOptions = "infer", ) -> None: self.index = index @@ -195,7 +197,9 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self.handles = get_handle(path_or_buf, "rb", is_text=False) + self.handles = get_handle( + path_or_buf, "rb", is_text=False, compression=compression + ) self._path_or_buf = self.handles.handle diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index a64ade2b3c77c..db09983cacfbc 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -17,6 +17,7 @@ import numpy as np from pandas._typing import ( + CompressionOptions, DatetimeNaTType, FilePath, ReadBuffer, @@ -256,6 +257,7 @@ def __init__( index=None, encoding: str | None = "ISO-8859-1", chunksize=None, + compression: CompressionOptions = "infer", ) -> None: self._encoding = encoding @@ -264,7 +266,11 @@ def __init__( self._chunksize = chunksize self.handles = get_handle( - filepath_or_buffer, "rb", encoding=encoding, is_text=False + filepath_or_buffer, + "rb", + encoding=encoding, + is_text=False, + compression=compression, ) self.filepath_or_buffer = self.handles.handle diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index f50fc777f55e9..a9b314318fa4e 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -14,9 +14,16 @@ ) from pandas._typing import ( + CompressionOptions, FilePath, ReadBuffer, ) +from pandas.util._decorators import ( + Appender, + deprecate_nonkeyword_arguments, +) + +from pandas.core.shared_docs import _shared_docs from 
pandas.io.common import stringify_path @@ -24,6 +31,44 @@ from pandas import DataFrame +_doc_read_sas = r""" +Read SAS files stored as either XPORT or SAS7BDAT format files. + +Parameters +---------- +filepath_or_buffer : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``read()`` function. The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.sas``. +format : str {{'xport', 'sas7bdat'}} or None + If None, file format is inferred from file extension. If 'xport' or + 'sas7bdat', uses the corresponding format. +index : identifier of index column, defaults to None + Identifier of column that should be used as index of the DataFrame. +encoding : str, default is None + Encoding for text data. If None, text data are stored as raw bytes. +chunksize : int + Read file `chunksize` lines at a time, returns iterator. + + .. versionchanged:: 1.2 + + ``TextFileReader`` is a context manager. +iterator : bool, defaults to False + If True, returns an iterator for reading the file incrementally. + + .. versionchanged:: 1.2 + + ``TextFileReader`` is a context manager. +{decompression_options} + +Returns +------- +DataFrame if iterator=False and chunksize=None, else SAS7BDATReader or XportReader +""" + + # TODO(PY38): replace with Protocol in Python 3.8 class ReaderBase(metaclass=ABCMeta): """ @@ -53,6 +98,7 @@ def read_sas( encoding: str | None = ..., chunksize: int = ..., iterator: bool = ..., + compression: CompressionOptions = ..., ) -> ReaderBase: ... @@ -65,10 +111,17 @@ def read_sas( encoding: str | None = ..., chunksize: None = ..., iterator: bool = ..., + compression: CompressionOptions = ..., ) -> DataFrame | ReaderBase: ... 
+@deprecate_nonkeyword_arguments( + version=None, allowed_args=["filepath_or_buffer"], stacklevel=2 +) +@Appender( + _doc_read_sas.format(decompression_options=_shared_docs["decompression_options"]) +) def read_sas( filepath_or_buffer: FilePath | ReadBuffer[bytes], format: str | None = None, @@ -76,43 +129,8 @@ def read_sas( encoding: str | None = None, chunksize: int | None = None, iterator: bool = False, + compression: CompressionOptions = "infer", ) -> DataFrame | ReaderBase: - """ - Read SAS files stored as either XPORT or SAS7BDAT format files. - - Parameters - ---------- - filepath_or_buffer : str, path object, or file-like object - String, path object (implementing ``os.PathLike[str]``), or file-like - object implementing a binary ``read()`` function. The string could be a URL. - Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. A local file could be: - ``file://localhost/path/to/table.sas``. - format : str {'xport', 'sas7bdat'} or None - If None, file format is inferred from file extension. If 'xport' or - 'sas7bdat', uses the corresponding format. - index : identifier of index column, defaults to None - Identifier of column that should be used as index of the DataFrame. - encoding : str, default is None - Encoding for text data. If None, text data are stored as raw bytes. - chunksize : int - Read file `chunksize` lines at a time, returns iterator. - - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. - iterator : bool, defaults to False - If True, returns an iterator for reading the file incrementally. - - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. 
- - Returns - ------- - DataFrame if iterator=False and chunksize=None, else SAS7BDATReader - or XportReader - """ if format is None: buffer_error_msg = ( "If this is a buffer object rather " @@ -122,12 +140,14 @@ def read_sas( if not isinstance(filepath_or_buffer, str): raise ValueError(buffer_error_msg) fname = filepath_or_buffer.lower() - if fname.endswith(".xpt"): + if ".xpt" in fname: format = "xport" - elif fname.endswith(".sas7bdat"): + elif ".sas7bdat" in fname: format = "sas7bdat" else: - raise ValueError("unable to infer format of SAS file") + raise ValueError( + f"unable to infer format of SAS file from filename: {repr(fname)}" + ) reader: ReaderBase if format.lower() == "xport": @@ -138,6 +158,7 @@ def read_sas( index=index, encoding=encoding, chunksize=chunksize, + compression=compression, ) elif format.lower() == "sas7bdat": from pandas.io.sas.sas7bdat import SAS7BDATReader @@ -147,6 +168,7 @@ def read_sas( index=index, encoding=encoding, chunksize=chunksize, + compression=compression, ) else: raise ValueError("unknown SAS format") diff --git a/pandas/tests/io/sas/test_sas.py b/pandas/tests/io/sas/test_sas.py index 5d2643c20ceb2..1e38baf4fc409 100644 --- a/pandas/tests/io/sas/test_sas.py +++ b/pandas/tests/io/sas/test_sas.py @@ -20,7 +20,15 @@ def test_sas_buffer_format(self): def test_sas_read_no_format_or_extension(self): # see gh-24548 - msg = "unable to infer format of SAS file" + msg = "unable to infer format of SAS file.+" with tm.ensure_clean("test_file_no_extension") as path: with pytest.raises(ValueError, match=msg): read_sas(path) + + +def test_sas_archive(datapath): + fname_uncompressed = datapath("io", "sas", "data", "airline.sas7bdat") + df_uncompressed = read_sas(fname_uncompressed) + fname_compressed = datapath("io", "sas", "data", "airline.sas7bdat.gz") + df_compressed = read_sas(fname_compressed, format="sas7bdat") + tm.assert_frame_equal(df_uncompressed, df_compressed) From b260806159a6af22fa13bacb8e4f67de881ee3b7 Mon Sep 17 00:00:00 
2001 From: Jonas Haag Date: Sat, 28 May 2022 01:18:16 +0200 Subject: [PATCH 2/4] Add missing file --- pandas/tests/io/sas/data/airline.sas7bdat.gz | Bin 0 -> 1431 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/tests/io/sas/data/airline.sas7bdat.gz diff --git a/pandas/tests/io/sas/data/airline.sas7bdat.gz b/pandas/tests/io/sas/data/airline.sas7bdat.gz new file mode 100644 index 0000000000000000000000000000000000000000..7b56e492295f420faaf808eac1665e83c845a8b5 GIT binary patch literal 1431 zcmV;I1!(#oiwFozQITQ-17T@$Y-w&~E^}dXH)3RAbO7yF4NOy46h1Ar2z1yY3K7&d z#+Lno1r;^Xyw@m>j*b~H{w!u`-O7Y9=xRY+%q%)Lx4F#C5octsn-VnpjS|BWXFB85 zZ8$Na$iP{KOx9?0BaQY{v~YQIcbueEy|3OL|24kbXGsn5;c3Y7WR6TDD(7T~*yF zf(x;}{#}34vnE+v_xOL1wQwetW9N5vh){`G#@8qN+}`hxBmWCinsF>nMXaD%@QVnm z5DV%@0T%;n{^E9My$1Af*RwFZsAz{;ZwNwK=U=^nEEuszJgfy z*wzskckNbO+xqPQVy#_G0~o_v@2?`P=Xsbsc_d=DJ5D0z?E@GC%TIv#yG7W*^U(LQ z4r3`SL_Xi1#fY`Oa}jXwM=JaxY~*_zXdXrRXhELd^3%xM@lu5B_iWknn+Wgmc;i1e zU@Whj5aYSH3is%_REPV3la&`mcptICS}6`=N!9YTt28|B&RWEJO5=VPp@#F$dSeXg z>e+3`_sM6Rf9pXl#@1WFupVBw+-YNxf6z#!BJ=ZTFZ|}M9=uGlH*|g|h37U8wDt8(Adjed> zc$=f}?0X)wjiKj)1kF4T)e#xEmbDA9k~I1_^(619N#hysY3lX7$$YKEKag*?4zaM> z&-a$7xle+2j@{IdNwws6m4EqBx^EEc-anDm)yI31on(p^?-}nm38&}|N#J#TmBo9G zYEe4{>ocPs?-{APmDf$TCV}=bUV?6(v$n6YaQ-T>A2E&}u`k@4OA~Ru)y2sYT;RMF z+osVyfOnGn#U$(DD+B$3z?~r-_#PVG94o|xJ;!nv>&%3x zu;)Aoqr#peK=8Z!S|AXZr+9*XjSBThvULhOUSTII>{Nvv6h+o@uYJmk4ErLRq=X zQSO3;%(ue!oLz9*Ty|#XvtLoE%kH!9{z2h zzgZet@{E+T$S`_U7a7XE%&nz0w6tFpaRzmOd?Uyw>w&;OYu$en-_@J=`T}>_C;zTo zVWK6wm&sA4V`*$Ks^L-W%#dgmb~=QLAW%*M|It3e`S!h?$n{i{){%}Soklt`%=I!k6lR Date: Sun, 12 Jun 2022 19:30:28 +0200 Subject: [PATCH 3/4] Review feedback --- pandas/io/sas/sasreader.py | 81 ++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index a9b314318fa4e..421fcaee26820 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -19,7 +19,7 @@ 
ReadBuffer, ) from pandas.util._decorators import ( - Appender, + Substitution, deprecate_nonkeyword_arguments, ) @@ -31,44 +31,6 @@ from pandas import DataFrame -_doc_read_sas = r""" -Read SAS files stored as either XPORT or SAS7BDAT format files. - -Parameters ----------- -filepath_or_buffer : str, path object, or file-like object - String, path object (implementing ``os.PathLike[str]``), or file-like - object implementing a binary ``read()`` function. The string could be a URL. - Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. A local file could be: - ``file://localhost/path/to/table.sas``. -format : str {{'xport', 'sas7bdat'}} or None - If None, file format is inferred from file extension. If 'xport' or - 'sas7bdat', uses the corresponding format. -index : identifier of index column, defaults to None - Identifier of column that should be used as index of the DataFrame. -encoding : str, default is None - Encoding for text data. If None, text data are stored as raw bytes. -chunksize : int - Read file `chunksize` lines at a time, returns iterator. - - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. -iterator : bool, defaults to False - If True, returns an iterator for reading the file incrementally. - - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. 
-{decompression_options} - -Returns -------- -DataFrame if iterator=False and chunksize=None, else SAS7BDATReader or XportReader -""" - - # TODO(PY38): replace with Protocol in Python 3.8 class ReaderBase(metaclass=ABCMeta): """ @@ -119,9 +81,7 @@ def read_sas( @deprecate_nonkeyword_arguments( version=None, allowed_args=["filepath_or_buffer"], stacklevel=2 ) -@Appender( - _doc_read_sas.format(decompression_options=_shared_docs["decompression_options"]) -) +@Substitution(decompression_options=_shared_docs["decompression_options"]) def read_sas( filepath_or_buffer: FilePath | ReadBuffer[bytes], format: str | None = None, @@ -131,6 +91,43 @@ def read_sas( iterator: bool = False, compression: CompressionOptions = "infer", ) -> DataFrame | ReaderBase: + """ + Read SAS files stored as either XPORT or SAS7BDAT format files. + + Parameters + ---------- + filepath_or_buffer : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``read()`` function. The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.sas``. + format : str {'xport', 'sas7bdat'} or None + If None, file format is inferred from file extension. If 'xport' or + 'sas7bdat', uses the corresponding format. + index : identifier of index column, defaults to None + Identifier of column that should be used as index of the DataFrame. + encoding : str, default is None + Encoding for text data. If None, text data are stored as raw bytes. + chunksize : int + Read file `chunksize` lines at a time, returns iterator. + + .. versionchanged:: 1.2 + + ``TextFileReader`` is a context manager. + iterator : bool, defaults to False + If True, returns an iterator for reading the file incrementally. + + .. versionchanged:: 1.2 + + ``TextFileReader`` is a context manager. 
+ %(decompression_options)s + + Returns + ------- + DataFrame if iterator=False and chunksize=None, else SAS7BDATReader + or XportReader + """ if format is None: buffer_error_msg = ( "If this is a buffer object rather " From 230628964df2726ab759e6734ea66f9c7f7f6a79 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Mon, 13 Jun 2022 10:39:29 +0200 Subject: [PATCH 4/4] Use @doc for read_sas docstring and escape literal braces --- pandas/io/sas/sasreader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 421fcaee26820..ff50df886e627 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -19,8 +19,8 @@ ReadBuffer, ) from pandas.util._decorators import ( - Substitution, deprecate_nonkeyword_arguments, + doc, ) from pandas.core.shared_docs import _shared_docs @@ -81,7 +81,7 @@ def read_sas( @deprecate_nonkeyword_arguments( version=None, allowed_args=["filepath_or_buffer"], stacklevel=2 ) -@Substitution(decompression_options=_shared_docs["decompression_options"]) +@doc(decompression_options=_shared_docs["decompression_options"]) def read_sas( filepath_or_buffer: FilePath | ReadBuffer[bytes], format: str | None = None, @@ -102,7 +102,7 @@ def read_sas( Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.sas``. - format : str {'xport', 'sas7bdat'} or None + format : str {{'xport', 'sas7bdat'}} or None If None, file format is inferred from file extension. If 'xport' or 'sas7bdat', uses the corresponding format. index : identifier of index column, defaults to None @@ -121,7 +121,7 @@ def read_sas( .. versionchanged:: 1.2 ``TextFileReader`` is a context manager. - %(decompression_options)s + {decompression_options} Returns -------