diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index fb7af00f61534..2add33becd679 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -734,7 +734,7 @@ I/O - :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) -- +- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) Plotting ^^^^^^^^ diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index b2d930c1be5e7..efeb306b618d1 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -614,7 +614,7 @@ def read(self, nrows=None): ns = (self.column_types == b's').sum() self._string_chunk = np.empty((ns, nrows), dtype=np.object) - self._byte_chunk = np.empty((nd, 8 * nrows), dtype=np.uint8) + self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8) self._current_row_in_chunk_index = 0 p = Parser(self) diff --git a/pandas/tests/io/sas/data/cars.sas7bdat b/pandas/tests/io/sas/data/cars.sas7bdat new file mode 100644 index 0000000000000..ca5d3474c36ad Binary files /dev/null and b/pandas/tests/io/sas/data/cars.sas7bdat differ diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 101ee3e619f5b..efde152a918bd 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -183,6 +183,22 @@ def test_date_time(datapath): tm.assert_frame_equal(df, df0) +def test_compact_numerical_values(datapath): + # Regression test for #21616 + fname = datapath("io", "sas", "data", "cars.sas7bdat") + df = 
pd.read_sas(fname, encoding='latin-1') + # The two columns CYL and WGT in cars.sas7bdat have column + # width < 8 and only contain integral values. + # Test that pandas doesn't corrupt the numbers by adding + # decimals. + result = df['WGT'] + expected = df['WGT'].round() + tm.assert_series_equal(result, expected, check_exact=True) + result = df['CYL'] + expected = df['CYL'].round() + tm.assert_series_equal(result, expected, check_exact=True) + + def test_zero_variables(datapath): # Check if the SAS file has zero variables (PR #18184) fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")