From 5b9cd4b81f284c5fa1248b7178cac754cae23cf8 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Fri, 17 Jun 2022 12:07:51 +0200 Subject: [PATCH 01/17] Fast byteswap --- pandas/io/sas/sas.pyx | 82 +++++++++++++++++++++++++++++++++++++-- pandas/io/sas/sas7bdat.py | 46 ++++++++++++++++------ 2 files changed, 114 insertions(+), 14 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 9fcef64e07133..1bcc0f037b309 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -1,13 +1,19 @@ # cython: profile=False # cython: boundscheck=False, initializedcheck=False from cython cimport Py_ssize_t +from libc.stdint cimport ( + int64_t, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) +from libc.string cimport memcpy + import numpy as np import pandas.io.sas.sas_constants as const -ctypedef signed long long int64_t -ctypedef unsigned char uint8_t -ctypedef unsigned short uint16_t # rle_decompress decompresses data using a Run Length Encoding # algorithm. It is partially documented here: @@ -433,3 +439,73 @@ cdef class Parser: self.current_row_on_page_index += 1 self.current_row_in_chunk_index += 1 self.current_row_in_file_index += 1 + + +def read_float_with_byteswap(const uint8_t *data, bint byteswap): + cdef float res = (data)[0] + if byteswap: + res = _byteswap_float(res) + return res + + +def read_double_with_byteswap(const uint8_t *data, bint byteswap): + cdef double res = (data)[0] + if byteswap: + res = _byteswap_double(res) + return res + + +def read_uint16_with_byteswap(const uint8_t *data, bint byteswap): + cdef uint16_t res = (data)[0] + if byteswap: + res = _byteswap2(res) + return res + + +def read_uint32_with_byteswap(const uint8_t *data, bint byteswap): + cdef uint32_t res = (data)[0] + if byteswap: + res = _byteswap4(res) + return res + + +def read_uint64_with_byteswap(const uint8_t *data, bint byteswap): + cdef uint64_t res = (data)[0] + if byteswap: + res = _byteswap8(res) + return res + + +# Byteswapping +# From https://github.com/WizardMac/ReadStat/blob/master/src/readstat_bits. +# Copyright (c) 2013-2016 Evan Miller, Apache 2 License + +cdef inline uint16_t _byteswap2(uint16_t num): + return ((num & 0xFF00) >> 8) | ((num & 0x00FF) << 8) + + +cdef inline uint32_t _byteswap4(uint32_t num): + num = ((num & 0xFFFF0000) >> 16) | ((num & 0x0000FFFF) << 16) + return ((num & 0xFF00FF00) >> 8) | ((num & 0x00FF00FF) << 8) + + +cdef inline uint64_t _byteswap8(uint64_t num): + num = ((num & 0xFFFFFFFF00000000) >> 32) | ((num & 0x00000000FFFFFFFF) << 32) + num = ((num & 0xFFFF0000FFFF0000) >> 16) | ((num & 0x0000FFFF0000FFFF) << 16) + return ((num & 0xFF00FF00FF00FF00) >> 8) | ((num & 0x00FF00FF00FF00FF) << 8) + + +cdef inline float _byteswap_float(float num): + cdef uint32_t answer = 0 + memcpy(&answer, &num, 4) + answer = _byteswap4(answer) + memcpy(&num, &answer, 4) + return num + + +cdef inline double _byteswap_double(double num): + cdef uint64_t answer = 0 + memcpy(&answer, &num, 8) + answer = _byteswap8(answer) + memcpy(&num, &answer, 8) + return num diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 7282affe1b5e6..31ab35b475374 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -20,7 +20,7 @@ datetime, timedelta, ) -import struct +import sys from typing import cast import numpy as np @@ -42,7 +42,14 @@ ) from pandas.io.common import get_handle -from pandas.io.sas._sas import Parser +from pandas.io.sas._sas import ( + Parser, + read_double_with_byteswap, + read_float_with_byteswap, + read_uint16_with_byteswap, + read_uint32_with_byteswap, + read_uint64_with_byteswap, +) import pandas.io.sas.sas_constants as const from pandas.io.sas.sasreader import ReaderBase @@ -259,8 +266,10 @@ def _get_properties(self) -> None: buf = self._read_bytes(const.endianness_offset, const.endianness_length) if buf == b"\x01": self.byte_order = "<" + self.need_byteswap = sys.byteorder == "big" else: self.byte_order = ">" + self.need_byteswap = sys.byteorder == "little" # Get encoding information buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0] @@ -345,22 +354,37 @@ def __next__(self) -> DataFrame: # Read a single float of the given width (4 or 8). def _read_float(self, offset: int, width: int): - if width not in (4, 8): + if width == 4: + return read_float_with_byteswap( + self._read_bytes(offset, 4), self.need_byteswap + ) + elif width == 8: + return read_double_with_byteswap( + self._read_bytes(offset, 8), self.need_byteswap + ) + else: self.close() raise ValueError("invalid float width") - buf = self._read_bytes(offset, width) - fd = "f" if width == 4 else "d" - return struct.unpack(self.byte_order + fd, buf)[0] # Read a single signed integer of the given width (1, 2, 4 or 8). def _read_int(self, offset: int, width: int) -> int: - if width not in (1, 2, 4, 8): + if width == 1: + return self._read_bytes(offset, 1)[0] + elif width == 2: + return read_uint16_with_byteswap( + self._read_bytes(offset, 2), self.need_byteswap + ) + elif width == 4: + return read_uint32_with_byteswap( + self._read_bytes(offset, 4), self.need_byteswap + ) + elif width == 8: + return read_uint64_with_byteswap( + self._read_bytes(offset, 8), self.need_byteswap + ) + else: self.close() raise ValueError("invalid int width") - buf = self._read_bytes(offset, width) - it = {1: "b", 2: "h", 4: "l", 8: "q"}[width] - iv = struct.unpack(self.byte_order + it, buf)[0] - return iv def _read_bytes(self, offset: int, length: int): if self._cached_page is None: From 17c965f964114682d3b11ff7babc8d7c0db6b381 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Fri, 17 Jun 2022 15:36:33 +0200 Subject: [PATCH 02/17] Add types --- pandas/io/sas/_sas.pyi | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/sas/_sas.pyi b/pandas/io/sas/_sas.pyi index 527193dd71e57..e8b2dce7d3266 100644 --- a/pandas/io/sas/_sas.pyi +++ b/pandas/io/sas/_sas.pyi @@ -3,3 +3,9 @@ from pandas.io.sas.sas7bdat import SAS7BDATReader class Parser: def __init__(self, parser: SAS7BDATReader) -> None: ... def read(self, nrows: int) -> None: ... + +def read_float_with_byteswap(data: bytes, byteswap: bool) -> float: ... +def read_double_with_byteswap(data: bytes, byteswap: bool) -> float: ... +def read_uint16_with_byteswap(data: bytes, byteswap: bool) -> int: ... +def read_uint32_with_byteswap(data: bytes, byteswap: bool) -> int: ... +def read_uint64_with_byteswap(data: bytes, byteswap: bool) -> int: ... From 435a003c916bf5f5c02d8182d30e903d47f4f718 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sat, 9 Jul 2022 09:45:17 +0200 Subject: [PATCH 03/17] Review feedback --- pandas/io/sas/sas.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index b9e84672e611a..d75a1688075d0 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -439,6 +439,8 @@ cdef class Parser: self.current_row_in_file_index += 1 +# The following are faster versions of struct.unpack that avoid the overhead of +# Python function calls. They may be called up to (n_rows * n_cols) times. def read_float_with_byteswap(const uint8_t *data, bint byteswap): cdef float res = (data)[0] if byteswap: From 10ab87fb7624c29e8b9aa7b475f983a48ad8af0b Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sat, 9 Jul 2022 21:44:09 +0200 Subject: [PATCH 04/17] Slightly faster variant (1 less bytes obj construction) --- pandas/io/sas/_sas.pyi | 10 +++++----- pandas/io/sas/sas.pyx | 35 +++++++++++++++++++++++++---------- pandas/io/sas/sas7bdat.py | 27 +++++++++------------------ 3 files changed, 39 insertions(+), 33 deletions(-) diff --git a/pandas/io/sas/_sas.pyi b/pandas/io/sas/_sas.pyi index e8b2dce7d3266..8353a9ee0a890 100644 --- a/pandas/io/sas/_sas.pyi +++ b/pandas/io/sas/_sas.pyi @@ -4,8 +4,8 @@ class Parser: def __init__(self, parser: SAS7BDATReader) -> None: ... def read(self, nrows: int) -> None: ... -def read_float_with_byteswap(data: bytes, byteswap: bool) -> float: ... -def read_double_with_byteswap(data: bytes, byteswap: bool) -> float: ... -def read_uint16_with_byteswap(data: bytes, byteswap: bool) -> int: ... -def read_uint32_with_byteswap(data: bytes, byteswap: bool) -> int: ... -def read_uint64_with_byteswap(data: bytes, byteswap: bool) -> int: ... +def read_float_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ... +def read_double_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ... +def read_uint16_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ... +def read_uint32_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ... +def read_uint64_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ... diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index d75a1688075d0..a4944cd3bfc3e 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -441,36 +441,51 @@ cdef class Parser: # The following are faster versions of struct.unpack that avoid the overhead of # Python function calls. They may be called up to (n_rows * n_cols) times. -def read_float_with_byteswap(const uint8_t *data, bint byteswap): - cdef float res = (data)[0] +def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): + assert offset + 4 < len(data) + cdef: + const char *data_ptr = data + float res = ((data_ptr + offset))[0] if byteswap: res = _byteswap_float(res) return res -def read_double_with_byteswap(const uint8_t *data, bint byteswap): - cdef double res = (data)[0] +def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): + assert offset + 8 < len(data) + cdef: + const char *data_ptr = data + double res = ((data_ptr + offset))[0] if byteswap: res = _byteswap_double(res) return res -def read_uint16_with_byteswap(const uint8_t *data, bint byteswap): - cdef uint16_t res = (data)[0] +def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): + assert offset + 2 < len(data) + cdef: + const char *data_ptr = data + uint16_t res = ((data_ptr + offset))[0] if byteswap: res = _byteswap2(res) return res -def read_uint32_with_byteswap(const uint8_t *data, bint byteswap): - cdef uint32_t res = (data)[0] +def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): + assert offset + 4 < len(data) + cdef: + const char *data_ptr = data + uint32_t res = ((data_ptr + offset))[0] if byteswap: res = _byteswap4(res) return res -def read_uint64_with_byteswap(const uint8_t *data, bint byteswap): - cdef uint64_t res = (data)[0] +def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): + assert offset + 8 < len(data) + cdef: + const char *data_ptr = data + uint64_t res = ((data_ptr + offset))[0] if byteswap: res = _byteswap8(res) return res diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 31ab35b475374..72a05744d0f8e 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -356,11 +356,11 @@ def __next__(self) -> DataFrame: def _read_float(self, offset: int, width: int): if width == 4: return read_float_with_byteswap( - self._read_bytes(offset, 4), self.need_byteswap + self._cached_page, offset, self.need_byteswap ) elif width == 8: return read_double_with_byteswap( - self._read_bytes(offset, 8), self.need_byteswap + self._cached_page, offset, self.need_byteswap ) else: self.close() @@ -372,34 +372,25 @@ def _read_int(self, offset: int, width: int) -> int: return self._read_bytes(offset, 1)[0] elif width == 2: return read_uint16_with_byteswap( - self._read_bytes(offset, 2), self.need_byteswap + self._cached_page, offset, self.need_byteswap ) elif width == 4: return read_uint32_with_byteswap( - self._read_bytes(offset, 4), self.need_byteswap + self._cached_page, offset, self.need_byteswap ) elif width == 8: return read_uint64_with_byteswap( - self._read_bytes(offset, 8), self.need_byteswap + self._cached_page, offset, self.need_byteswap ) else: self.close() raise ValueError("invalid int width") def _read_bytes(self, offset: int, length: int): - if self._cached_page is None: - self._path_or_buf.seek(offset) - buf = self._path_or_buf.read(length) - if len(buf) < length: - self.close() - msg = f"Unable to read {length:d} bytes from file position {offset:d}." - raise ValueError(msg) - return buf - else: - if offset + length > len(self._cached_page): - self.close() - raise ValueError("The cached page is too small.") - return self._cached_page[offset : offset + length] + if offset + length > len(self._cached_page): + self.close() + raise ValueError("The cached page is too small.") + return self._cached_page[offset : offset + length] def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes: return self._convert_header_text( From ad74f5c38526203559e8d8e63848f1b15b25ec97 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sun, 10 Jul 2022 22:22:06 +0200 Subject: [PATCH 05/17] Make MyPy happy? --- pandas/io/sas/sas7bdat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 72a05744d0f8e..832b36da34952 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -354,6 +354,7 @@ def __next__(self) -> DataFrame: # Read a single float of the given width (4 or 8). def _read_float(self, offset: int, width: int): + assert self._cached_page is not None if width == 4: return read_float_with_byteswap( self._cached_page, offset, self.need_byteswap @@ -368,6 +369,7 @@ def _read_float(self, offset: int, width: int): # Read a single signed integer of the given width (1, 2, 4 or 8). def _read_int(self, offset: int, width: int) -> int: + assert self._cached_page is not None if width == 1: return self._read_bytes(offset, 1)[0] elif width == 2: From 9c5b4b3449a05f8b6f8464e5b96fe63d3a639a6f Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Mon, 11 Jul 2022 08:51:04 +0200 Subject: [PATCH 06/17] Update sas7bdat.py --- pandas/io/sas/sas7bdat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 832b36da34952..f36e63a0e000b 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -389,6 +389,7 @@ def _read_int(self, offset: int, width: int) -> int: raise ValueError("invalid int width") def _read_bytes(self, offset: int, length: int): + assert self._cached_page is not None if offset + length > len(self._cached_page): self.close() raise ValueError("The cached page is too small.") From f3c63f08e9b5d6e54e2c2e527c91042311908f3a Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Thu, 21 Jul 2022 09:10:03 +0200 Subject: [PATCH 07/17] Use intrinsics --- pandas/io/sas/sas.pyx | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index d9dc639e38050..3e0474903883d 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -495,35 +495,31 @@ def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): # Byteswapping -# From https://github.com/WizardMac/ReadStat/blob/master/src/readstat_bits. -# Copyright (c) 2013-2016 Evan Miller, Apache 2 License -cdef inline uint16_t _byteswap2(uint16_t num): - return ((num & 0xFF00) >> 8) | ((num & 0x00FF) << 8) - - -cdef inline uint32_t _byteswap4(uint32_t num): - num = ((num & 0xFFFF0000) >> 16) | ((num & 0x0000FFFF) << 16) - return ((num & 0xFF00FF00) >> 8) | ((num & 0x00FF00FF) << 8) - - -cdef inline uint64_t _byteswap8(uint64_t num): - num = ((num & 0xFFFFFFFF00000000) >> 32) | ((num & 0x00000000FFFFFFFF) << 32) - num = ((num & 0xFFFF0000FFFF0000) >> 16) | ((num & 0x0000FFFF0000FFFF) << 16) - return ((num & 0xFF00FF00FF00FF00) >> 8) | ((num & 0x00FF00FF00FF00FF) << 8) +cdef extern from *: + """ + #ifdef _MSC_VER + #define _byteswap2 _byteswap_ushort + #define _byteswap4 _byteswap_ulong + #define _byteswap8 _byteswap_uint64 + #else + #define _byteswap2 __builtin_bswap16 + #define _byteswap4 __builtin_bswap32 + #define _byteswap8 __builtin_bswap64 + #endif + """ + uint16_t _byteswap2(uint16_t) + uint32_t _byteswap4(uint32_t) + uint64_t _byteswap8(uint64_t) cdef inline float _byteswap_float(float num): - cdef uint32_t answer = 0 - memcpy(&answer, &num, 4) - answer = _byteswap4(answer) - memcpy(&num, &answer, 4) + cdef uint32_t *intptr = &num + intptr[0] = _byteswap4(intptr[0]) return num cdef inline double _byteswap_double(double num): - cdef uint64_t answer = 0 - memcpy(&answer, &num, 8) - answer = _byteswap8(answer) - memcpy(&num, &answer, 8) + cdef uint64_t *intptr = &num + intptr[0] = _byteswap8(intptr[0]) return num From c310c0d8889fbf8e9e43e32abb91e289f8c2d779 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sat, 10 Sep 2022 17:51:17 +0200 Subject: [PATCH 08/17] Lint --- pandas/io/sas/sas.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 3e0474903883d..353b24d983bb7 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -8,7 +8,6 @@ from libc.stdint cimport ( uint32_t, uint64_t, ) -from libc.string cimport memcpy import numpy as np From 3b7ba836263ac3d069017cff3968967d6f8ff833 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sun, 11 Sep 2022 00:34:20 +0200 Subject: [PATCH 09/17] Add tests + move byteswap to module --- pandas/io/sas/_byteswap.pyi | 5 ++ pandas/io/sas/_sas.pyi | 6 -- pandas/io/sas/byteswap.pyx | 92 ++++++++++++++++++++++++++++ pandas/io/sas/sas.pyx | 85 ------------------------- pandas/io/sas/sas7bdat.py | 4 +- pandas/tests/io/sas/test_byteswap.py | 48 +++++++++++++++ setup.py | 2 + 7 files changed, 149 insertions(+), 93 deletions(-) create mode 100644 pandas/io/sas/_byteswap.pyi create mode 100644 pandas/io/sas/byteswap.pyx create mode 100644 pandas/tests/io/sas/test_byteswap.py diff --git a/pandas/io/sas/_byteswap.pyi b/pandas/io/sas/_byteswap.pyi new file mode 100644 index 0000000000000..bb0dbfc6a50b1 --- /dev/null +++ b/pandas/io/sas/_byteswap.pyi @@ -0,0 +1,5 @@ +def read_float_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ... +def read_double_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ... +def read_uint16_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ... +def read_uint32_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ... +def read_uint64_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ... diff --git a/pandas/io/sas/_sas.pyi b/pandas/io/sas/_sas.pyi index 8353a9ee0a890..527193dd71e57 100644 --- a/pandas/io/sas/_sas.pyi +++ b/pandas/io/sas/_sas.pyi @@ -3,9 +3,3 @@ from pandas.io.sas.sas7bdat import SAS7BDATReader class Parser: def __init__(self, parser: SAS7BDATReader) -> None: ... def read(self, nrows: int) -> None: ... - -def read_float_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ... -def read_double_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ... -def read_uint16_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ... -def read_uint32_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ... -def read_uint64_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ... diff --git a/pandas/io/sas/byteswap.pyx b/pandas/io/sas/byteswap.pyx new file mode 100644 index 0000000000000..4620403910274 --- /dev/null +++ b/pandas/io/sas/byteswap.pyx @@ -0,0 +1,92 @@ +""" +The following are faster versions of struct.unpack that avoid the overhead of Python function calls. + +In the SAS7BDAT parser, they may be called up to (n_rows * n_cols) times. +""" +from cython cimport Py_ssize_t +from libc.stdint cimport ( + uint16_t, + uint32_t, + uint64_t, +) + + +def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): + assert offset + 4 < len(data) + cdef: + const char *data_ptr = data + float res = ((data_ptr + offset))[0] + if byteswap: + res = _byteswap_float(res) + return res + + +def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): + assert offset + 8 < len(data) + cdef: + const char *data_ptr = data + double res = ((data_ptr + offset))[0] + if byteswap: + res = _byteswap_double(res) + return res + + +def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): + assert offset + 2 < len(data) + cdef: + const char *data_ptr = data + uint16_t res = ((data_ptr + offset))[0] + if byteswap: + res = _byteswap2(res) + return res + + +def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): + assert offset + 4 < len(data) + cdef: + const char *data_ptr = data + uint32_t res = ((data_ptr + offset))[0] + if byteswap: + res = _byteswap4(res) + return res + + +def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): + assert offset + 8 < len(data) + cdef: + const char *data_ptr = data + uint64_t res = ((data_ptr + offset))[0] + if byteswap: + res = _byteswap8(res) + return res + + +# Byteswapping + +cdef extern from *: + """ + #ifdef _MSC_VER + #define _byteswap2 _byteswap_ushort + #define _byteswap4 _byteswap_ulong + #define _byteswap8 _byteswap_uint64 + #else + #define _byteswap2 __builtin_bswap16 + #define _byteswap4 __builtin_bswap32 + #define _byteswap8 __builtin_bswap64 + #endif + """ + uint16_t _byteswap2(uint16_t) + uint32_t _byteswap4(uint32_t) + uint64_t _byteswap8(uint64_t) + + +cdef inline float _byteswap_float(float num): + cdef uint32_t *intptr = &num + intptr[0] = _byteswap4(intptr[0]) + return num + + +cdef inline double _byteswap_double(double num): + cdef uint64_t *intptr = &num + intptr[0] = _byteswap8(intptr[0]) + return num diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 353b24d983bb7..0194ae4d24ecf 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -5,8 +5,6 @@ from libc.stdint cimport ( int64_t, uint8_t, uint16_t, - uint32_t, - uint64_t, ) import numpy as np @@ -439,86 +437,3 @@ cdef class Parser: self.current_row_on_page_index += 1 self.current_row_in_chunk_index += 1 self.current_row_in_file_index += 1 - - -# The following are faster versions of struct.unpack that avoid the overhead of -# Python function calls. They may be called up to (n_rows * n_cols) times. -def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): - assert offset + 4 < len(data) - cdef: - const char *data_ptr = data - float res = ((data_ptr + offset))[0] - if byteswap: - res = _byteswap_float(res) - return res - - -def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): - assert offset + 8 < len(data) - cdef: - const char *data_ptr = data - double res = ((data_ptr + offset))[0] - if byteswap: - res = _byteswap_double(res) - return res - - -def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): - assert offset + 2 < len(data) - cdef: - const char *data_ptr = data - uint16_t res = ((data_ptr + offset))[0] - if byteswap: - res = _byteswap2(res) - return res - - -def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): - assert offset + 4 < len(data) - cdef: - const char *data_ptr = data - uint32_t res = ((data_ptr + offset))[0] - if byteswap: - res = _byteswap4(res) - return res - - -def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): - assert offset + 8 < len(data) - cdef: - const char *data_ptr = data - uint64_t res = ((data_ptr + offset))[0] - if byteswap: - res = _byteswap8(res) - return res - - -# Byteswapping - -cdef extern from *: - """ - #ifdef _MSC_VER - #define _byteswap2 _byteswap_ushort - #define _byteswap4 _byteswap_ulong - #define _byteswap8 _byteswap_uint64 - #else - #define _byteswap2 __builtin_bswap16 - #define _byteswap4 __builtin_bswap32 - #define _byteswap8 __builtin_bswap64 - #endif - """ - uint16_t _byteswap2(uint16_t) - uint32_t _byteswap4(uint32_t) - uint64_t _byteswap8(uint64_t) - - -cdef inline float _byteswap_float(float num): - cdef uint32_t *intptr = &num - intptr[0] = _byteswap4(intptr[0]) - return num - - -cdef inline double _byteswap_double(double num): - cdef uint64_t *intptr = &num - intptr[0] = _byteswap8(intptr[0]) - return num diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index a1069b47d8812..8b03201883d06 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -42,14 +42,14 @@ ) from pandas.io.common import get_handle -from pandas.io.sas._sas import ( - Parser, +from pandas.io.sas._byteswap import ( read_double_with_byteswap, read_float_with_byteswap, read_uint16_with_byteswap, read_uint32_with_byteswap, read_uint64_with_byteswap, ) +from pandas.io.sas._sas import Parser import pandas.io.sas.sas_constants as const from pandas.io.sas.sasreader import ReaderBase diff --git a/pandas/tests/io/sas/test_byteswap.py b/pandas/tests/io/sas/test_byteswap.py new file mode 100644 index 0000000000000..84a6bf1cb6b1e --- /dev/null +++ b/pandas/tests/io/sas/test_byteswap.py @@ -0,0 +1,48 @@ +import struct +import sys + +from hypothesis import ( + assume, + example, + given, + strategies as st, +) +import numpy as np +import pytest + +from pandas.io.sas._byteswap import ( + read_double_with_byteswap, + read_float_with_byteswap, + read_uint16_with_byteswap, + read_uint32_with_byteswap, + read_uint64_with_byteswap, +) + +_swapped_byte_order = {"big": "<", "little": ">"}[sys.byteorder] + + +@given(read_offset=st.integers(0, 11), number=st.integers(min_value=0)) +@example(number=2**16, read_offset=0) +@example(number=2**32, read_offset=0) +@example(number=2**64, read_offset=0) +@pytest.mark.parametrize("int_type", ["H", "I", "Q"]) +@pytest.mark.parametrize("should_byteswap", [True, False]) +def test_int_byteswap(read_offset, number, int_type, should_byteswap): + int_type_nbytes = struct.calcsize(int_type) + assume(number < 2 ** (8 * int_type_nbytes)) + number_bytes = struct.pack(int_type, number) + data = bytearray(np.random.default_rng().bytes(20)) + data[read_offset : read_offset + int_type_nbytes] = number_bytes + read_uintxx_with_byteswap = { + "H": read_uint16_with_byteswap, + "I": read_uint32_with_byteswap, + "Q": read_uint64_with_byteswap, + }[int_type] + output_number = read_uintxx_with_byteswap(bytes(data), read_offset, should_byteswap) + if should_byteswap: + (number_bytes_swapped,) = struct.unpack( + _swapped_byte_order + int_type, number_bytes + ) + assert output_number == number_bytes_swapped + else: + assert output_number == number diff --git a/setup.py b/setup.py index 12e8aa36c3794..3a7a2f3853ce5 100755 --- a/setup.py +++ b/setup.py @@ -226,6 +226,7 @@ class CheckSDist(sdist_class): "pandas/_libs/window/indexers.pyx", "pandas/_libs/writers.pyx", "pandas/io/sas/sas.pyx", + "pandas/io/sas/byteswap.pyx", ] _cpp_pyxfiles = [ @@ -570,6 +571,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "_libs.window.indexers": {"pyxfile": "_libs/window/indexers"}, "_libs.writers": {"pyxfile": "_libs/writers"}, "io.sas._sas": {"pyxfile": "io/sas/sas"}, + "io.sas._byteswap": {"pyxfile": "io/sas/byteswap"}, } extensions = [] From 53fbce2e5e2e39fdd3024bcd32c51306191e4af4 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sun, 11 Sep 2022 01:02:23 +0200 Subject: [PATCH 10/17] Add float tests + refactoring --- pandas/tests/io/sas/test_byteswap.py | 48 +++++++++++++++------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/pandas/tests/io/sas/test_byteswap.py b/pandas/tests/io/sas/test_byteswap.py index 84a6bf1cb6b1e..c11f8406b7803 100644 --- a/pandas/tests/io/sas/test_byteswap.py +++ b/pandas/tests/io/sas/test_byteswap.py @@ -1,6 +1,3 @@ -import struct -import sys - from hypothesis import ( assume, example, @@ -18,31 +15,38 @@ read_uint64_with_byteswap, ) -_swapped_byte_order = {"big": "<", "little": ">"}[sys.byteorder] - @given(read_offset=st.integers(0, 11), number=st.integers(min_value=0)) @example(number=2**16, read_offset=0) @example(number=2**32, read_offset=0) @example(number=2**64, read_offset=0) -@pytest.mark.parametrize("int_type", ["H", "I", "Q"]) +@pytest.mark.parametrize("int_type", [np.uint16, np.uint32, np.uint64]) @pytest.mark.parametrize("should_byteswap", [True, False]) def test_int_byteswap(read_offset, number, int_type, should_byteswap): - int_type_nbytes = struct.calcsize(int_type) - assume(number < 2 ** (8 * int_type_nbytes)) - number_bytes = struct.pack(int_type, number) - data = bytearray(np.random.default_rng().bytes(20)) - data[read_offset : read_offset + int_type_nbytes] = number_bytes - read_uintxx_with_byteswap = { - "H": read_uint16_with_byteswap, - "I": read_uint32_with_byteswap, - "Q": read_uint64_with_byteswap, - }[int_type] - output_number = read_uintxx_with_byteswap(bytes(data), read_offset, should_byteswap) + assume(number < 2 ** (8 * int_type(0).itemsize)) + _test(number, int_type, read_offset, should_byteswap) + + +@given(read_offset=st.integers(0, 11), number=st.floats()) +@pytest.mark.parametrize("float_type", [np.float32, np.float64]) +@pytest.mark.parametrize("should_byteswap", [True, False]) +def test_float_byteswap(read_offset, number, float_type, should_byteswap): + _test(number, float_type, read_offset, should_byteswap) + + +def _test(number, number_type, read_offset, should_byteswap): + number = number_type([number]) + data = np.random.default_rng().integers(0, 256, size=20, dtype="uint8") + data[read_offset : read_offset + number.itemsize] = number.view("uint8") + swap_func = { + np.float32: read_float_with_byteswap, + np.float64: read_double_with_byteswap, + np.uint16: read_uint16_with_byteswap, + np.uint32: read_uint32_with_byteswap, + np.uint64: read_uint64_with_byteswap, + }[type(number[0])] + output_number = swap_func(bytes(data), read_offset, should_byteswap) if should_byteswap: - (number_bytes_swapped,) = struct.unpack( - _swapped_byte_order + int_type, number_bytes - ) - assert output_number == number_bytes_swapped + np.testing.assert_equal(output_number, number.byteswap()) else: - assert output_number == number + np.testing.assert_equal(output_number, number) From 9cbc5beebe14c0a2f11ec827d0803ff557790df2 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sun, 11 Sep 2022 01:05:03 +0200 Subject: [PATCH 11/17] Undo unrelated changes --- pandas/io/sas/sas.pyx | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 0194ae4d24ecf..3ba0067331328 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -1,16 +1,14 @@ # cython: profile=False # cython: boundscheck=False, initializedcheck=False from cython cimport Py_ssize_t -from libc.stdint cimport ( - int64_t, - uint8_t, - uint16_t, -) - import numpy as np import pandas.io.sas.sas_constants as const +ctypedef signed long long int64_t +ctypedef unsigned char uint8_t +ctypedef unsigned short uint16_t + # rle_decompress decompresses data using a Run Length Encoding # algorithm. It is partially documented here: From 48028484fbbe617737e05015fe79a3d61ceb7764 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sun, 11 Sep 2022 01:05:30 +0200 Subject: [PATCH 12/17] Undo unrelated changes --- pandas/io/sas/sas.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 3ba0067331328..d8591c0b033a6 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -9,7 +9,6 @@ ctypedef signed long long int64_t ctypedef unsigned char uint8_t ctypedef unsigned short uint16_t - # rle_decompress decompresses data using a Run Length Encoding # algorithm. It is partially documented here: # From 41abe02d93707892d3d2c5c8056a02401dac246c Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sun, 11 Sep 2022 11:00:01 +0200 Subject: [PATCH 13/17] Lint --- pandas/tests/io/sas/test_byteswap.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/sas/test_byteswap.py b/pandas/tests/io/sas/test_byteswap.py index c11f8406b7803..2c88907df3b1d 100644 --- a/pandas/tests/io/sas/test_byteswap.py +++ b/pandas/tests/io/sas/test_byteswap.py @@ -7,6 +7,8 @@ import numpy as np import pytest +import pandas._testing as tm + from pandas.io.sas._byteswap import ( read_double_with_byteswap, read_float_with_byteswap, @@ -35,18 +37,18 @@ def test_float_byteswap(read_offset, number, float_type, should_byteswap): def _test(number, number_type, read_offset, should_byteswap): - number = number_type([number]) + number = number_type(number) data = np.random.default_rng().integers(0, 256, size=20, dtype="uint8") - data[read_offset : read_offset + number.itemsize] = number.view("uint8") + data[read_offset : read_offset + number.itemsize] = number[None].view("uint8") swap_func = { np.float32: read_float_with_byteswap, np.float64: read_double_with_byteswap, np.uint16: read_uint16_with_byteswap, np.uint32: read_uint32_with_byteswap, np.uint64: read_uint64_with_byteswap, - }[type(number[0])] - output_number = swap_func(bytes(data), read_offset, should_byteswap) + }[type(number)] + output_number = number_type(swap_func(bytes(data), read_offset, should_byteswap)) if should_byteswap: - np.testing.assert_equal(output_number, number.byteswap()) + tm.assert_equal(output_number, number.byteswap()) else: - np.testing.assert_equal(output_number, number) + tm.assert_equal(output_number, number) From bf0976a608fe2dcca56f94c85c6b087296ee53b4 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Thu, 15 Sep 2022 11:40:39 +0200 Subject: [PATCH 14/17] Update v1.6.0.rst --- doc/source/whatsnew/v1.6.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index e139af76a4926..89c6ec2b8b5fb 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -112,7 +112,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`). -- Performance improvement to :func:`read_sas` with ``blank_missing=True`` (:issue:`48502`) +- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) - .. --------------------------------------------------------------------------- From c7c1a2f46e77f8a52074ba5ca17e012187748fad Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Tue, 4 Oct 2022 11:46:12 +0200 Subject: [PATCH 15/17] read_int -> read_uint --- pandas/io/sas/sas7bdat.py | 66 +++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 23a62fe4c6def..ccb1cd8f3d9d3 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -310,7 +310,7 @@ def _get_properties(self) -> None: ) self.date_modified = epoch + pd.to_timedelta(x, unit="s") - self.header_length = self._read_int( + self.header_length = self._read_uint( const.header_size_offset + align1, const.header_size_length ) @@ -322,10 +322,10 @@ def _get_properties(self) -> None: if len(self._cached_page) != self.header_length: # type: ignore[arg-type] raise ValueError("The SAS7BDAT file appears to be truncated.") - self._page_length = self._read_int( + self._page_length = self._read_uint( const.page_size_offset + align1, const.page_size_length ) - self._page_count = self._read_int( + self._page_count = self._read_uint( const.page_count_offset + align1, const.page_count_length ) @@ -371,8 +371,8 @@ def _read_float(self, offset: int, width: int): self.close() raise ValueError("invalid float width") - # Read a single signed integer of the given width (1, 2, 4 or 8). - def _read_int(self, offset: int, width: int) -> int: + # Read a single unsigned integer of the given width (1, 2, 4 or 8). + def _read_uint(self, offset: int, width: int) -> int: assert self._cached_page is not None if width == 1: return self._read_bytes(offset, 1)[0] @@ -431,12 +431,12 @@ def _read_page_header(self) -> None: bit_offset = self._page_bit_offset tx = const.page_type_offset + bit_offset self._current_page_type = ( - self._read_int(tx, const.page_type_length) & const.page_type_mask2 + self._read_uint(tx, const.page_type_length) & const.page_type_mask2 ) tx = const.block_count_offset + bit_offset - self._current_page_block_count = self._read_int(tx, const.block_count_length) + self._current_page_block_count = self._read_uint(tx, const.block_count_length) tx = const.subheader_count_offset + bit_offset - self._current_page_subheaders_count = self._read_int( + self._current_page_subheaders_count = self._read_uint( tx, const.subheader_count_length ) @@ -477,16 +477,16 @@ def _process_subheader_pointers( subheader_pointer_length = self._subheader_pointer_length total_offset = offset + subheader_pointer_length * subheader_pointer_index - subheader_offset = self._read_int(total_offset, self._int_length) + subheader_offset = self._read_uint(total_offset, self._int_length) total_offset += self._int_length - subheader_length = self._read_int(total_offset, self._int_length) + subheader_length = self._read_uint(total_offset, self._int_length) total_offset += self._int_length - subheader_compression = self._read_int(total_offset, 1) + subheader_compression = self._read_uint(total_offset, 1) total_offset += 1 - subheader_type = self._read_int(total_offset, 1) + subheader_type = self._read_uint(total_offset, 1) x = _SubheaderPointer( subheader_offset, subheader_length, subheader_compression, subheader_type @@ -540,27 +540,27 @@ def _process_rowsize_subheader(self, offset: int, length: int) -> None: lcs_offset += 354 lcp_offset += 378 - self.row_length = self._read_int( + self.row_length = self._read_uint( offset + const.row_length_offset_multiplier * int_len, int_len ) - self.row_count = self._read_int( + self.row_count = self._read_uint( offset + const.row_count_offset_multiplier * int_len, int_len ) - self.col_count_p1 = self._read_int( + self.col_count_p1 = self._read_uint( offset + const.col_count_p1_multiplier * int_len, int_len ) - self.col_count_p2 = self._read_int( + self.col_count_p2 = self._read_uint( offset + const.col_count_p2_multiplier * int_len, int_len ) mx = const.row_count_on_mix_page_offset_multiplier * int_len - self._mix_page_row_count = self._read_int(offset + mx, int_len) - self._lcs = self._read_int(lcs_offset, 2) - self._lcp = self._read_int(lcp_offset, 2) + self._mix_page_row_count = self._read_uint(offset + mx, int_len) + self._lcs = self._read_uint(lcs_offset, 2) + self._lcp = self._read_uint(lcp_offset, 2) def _process_columnsize_subheader(self, offset: int, length: int) -> None: int_len = self._int_length offset += int_len - self.column_count = self._read_int(offset, int_len) + self.column_count = self._read_uint(offset, int_len) if self.col_count_p1 + self.col_count_p2 != self.column_count: print( f"Warning: column count mismatch ({self.col_count_p1} + " @@ -574,7 +574,7 @@ def _process_subheader_counts(self, offset: int, length: int) -> None: def _process_columntext_subheader(self, offset: int, length: int) -> None: offset += self._int_length - text_block_size = self._read_int(offset, const.text_block_size_length) + text_block_size = self._read_uint(offset, const.text_block_size_length) buf = self._read_bytes(offset, text_block_size) cname_raw = buf[0:text_block_size].rstrip(b"\x00 ") @@ -638,13 +638,13 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None: + const.column_name_length_offset ) - idx = self._read_int( + idx = self._read_uint( text_subheader, const.column_name_text_subheader_length ) - col_offset = self._read_int( + col_offset = self._read_uint( col_name_offset, const.column_name_offset_length ) - col_len = self._read_int(col_name_length, const.column_name_length_length) + col_len = self._read_uint(col_name_length, const.column_name_length_length) name_raw = self.column_names_raw[idx] cname = name_raw[col_offset : col_offset + col_len] @@ -667,13 +667,13 @@ def _process_columnattributes_subheader(self, offset: int, length: int) -> None: offset + 2 * int_len + const.column_type_offset + i * (int_len + 8) ) - x = self._read_int(col_data_offset, int_len) + x = self._read_uint(col_data_offset, int_len) self._column_data_offsets.append(x) - x = self._read_int(col_data_len, const.column_data_length_length) + x = self._read_uint(col_data_len, const.column_data_length_length) self._column_data_lengths.append(x) - x = self._read_int(col_types, const.column_type_length) + x = self._read_uint(col_types, const.column_type_length) self._column_types.append(b"d" if x == 1 else b"s") def _process_columnlist_subheader(self, offset: int, length: int) -> None: @@ -693,23 +693,23 @@ def _process_format_subheader(self, offset: int, length: int) -> None: col_label_offset = offset + const.column_label_offset_offset + 3 * int_len col_label_len = offset + const.column_label_length_offset + 3 * int_len - x = self._read_int( + x = self._read_uint( text_subheader_format, const.column_format_text_subheader_index_length ) format_idx = min(x, len(self.column_names_raw) - 1) - format_start = self._read_int( + format_start = self._read_uint( col_format_offset, const.column_format_offset_length ) - format_len = self._read_int(col_format_len, const.column_format_length_length) + format_len = self._read_uint(col_format_len, const.column_format_length_length) - label_idx = self._read_int( + label_idx = self._read_uint( text_subheader_label, const.column_label_text_subheader_index_length ) label_idx = min(label_idx, len(self.column_names_raw) - 1) - label_start = self._read_int(col_label_offset, const.column_label_offset_length) - label_len = self._read_int(col_label_len, const.column_label_length_length) + label_start = self._read_uint(col_label_offset, const.column_label_offset_length) + label_len = self._read_uint(col_label_len, const.column_label_length_length) label_names = self.column_names_raw[label_idx] column_label = self._convert_header_text( From 6a4a556c19e1795c7174b04f5ecc2a9eaed8e572 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Tue, 4 Oct 2022 12:00:58 +0200 Subject: [PATCH 16/17] Lint --- pandas/io/sas/sas7bdat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index ccb1cd8f3d9d3..c331064f72ee3 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -708,7 +708,9 @@ def _process_format_subheader(self, offset: int, length: int) -> None: ) label_idx = min(label_idx, len(self.column_names_raw) - 1) - label_start = self._read_uint(col_label_offset, const.column_label_offset_length) + label_start = self._read_uint( + col_label_offset, const.column_label_offset_length + ) label_len = self._read_uint(col_label_len, const.column_label_length_length) label_names = self.column_names_raw[label_idx] From a4394348f49ab0dd0d70c48edf03bb6ee38b1569 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Tue, 4 Oct 2022 21:39:44 +0200 Subject: [PATCH 17/17] Update sas7bdat.py --- pandas/io/sas/sas7bdat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 5fd89319d3115..c9e1cd7940d7e 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -459,7 +459,7 @@ def _process_page_metadata(self) -> None: subheader_compression = self._read_uint(total_offset, 1) total_offset += 1 - subheader_type = self._read_int(total_offset, 1) + subheader_type = self._read_uint(total_offset, 1) if ( subheader_length == 0