pandas-dev · mroeschke · Oct 5, 2022 · Jun 17, 2022 · Jun 17, 2022 · Jul 9, 2022
diff --git a/pandas/io/sas/_sas.pyi b/pandas/io/sas/_sas.pyi
@@ -3,3 +3,9 @@ from pandas.io.sas.sas7bdat import SAS7BDATReader
 class Parser:
     def __init__(self, parser: SAS7BDATReader) -> None: ...
     def read(self, nrows: int) -> None: ...
+
+def read_float_with_byteswap(data: bytes, byteswap: bool) -> float: ...
+def read_double_with_byteswap(data: bytes, byteswap: bool) -> float: ...
+def read_uint16_with_byteswap(data: bytes, byteswap: bool) -> int: ...
+def read_uint32_with_byteswap(data: bytes, byteswap: bool) -> int: ...
+def read_uint64_with_byteswap(data: bytes, byteswap: bool) -> int: ...
diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx
@@ -1,13 +1,19 @@
 # cython: profile=False
 # cython: boundscheck=False, initializedcheck=False
 from cython cimport Py_ssize_t
+from libc.stdint cimport (
+    int64_t,
+    uint8_t,
+    uint16_t,
+    uint32_t,
+    uint64_t,
+)
+from libc.string cimport memcpy
+
 import numpy as np
 
 import pandas.io.sas.sas_constants as const
 
-ctypedef signed long long   int64_t
-ctypedef unsigned char      uint8_t
-ctypedef unsigned short     uint16_t
 
 # rle_decompress decompresses data using a Run Length Encoding
 # algorithm.  It is partially documented here:
@@ -433,3 +439,73 @@ cdef class Parser:
         self.current_row_on_page_index += 1
         self.current_row_in_chunk_index += 1
         self.current_row_in_file_index += 1
+
+
+def read_float_with_byteswap(const uint8_t *data, bint byteswap):
+    cdef float res = (<float*>data)[0]
+    if byteswap:
+        res = _byteswap_float(res)
+    return res
+
+
+def read_double_with_byteswap(const uint8_t *data, bint byteswap):
+    cdef double res = (<double*>data)[0]
+    if byteswap:
+        res = _byteswap_double(res)
+    return res
+
+
+def read_uint16_with_byteswap(const uint8_t *data, bint byteswap):
+    cdef uint16_t res = (<uint16_t *>data)[0]
+    if byteswap:
+        res = _byteswap2(res)
+    return res
+
+
+def read_uint32_with_byteswap(const uint8_t *data, bint byteswap):
+    cdef uint32_t res = (<uint32_t *>data)[0]
+    if byteswap:
+        res = _byteswap4(res)
+    return res
+
+
+def read_uint64_with_byteswap(const uint8_t *data, bint byteswap):
+    cdef uint64_t res = (<uint64_t *>data)[0]
+    if byteswap:
+        res = _byteswap8(res)
+    return res
+
+
+# Byteswapping
+# From https://github.com/WizardMac/ReadStat/blob/master/src/readstat_bits.c
+# Copyright (c) 2013-2016 Evan Miller, Apache 2 License
+
+cdef inline uint16_t _byteswap2(uint16_t num):
+    return ((num & 0xFF00) >> 8) | ((num & 0x00FF) << 8)
+
+
+cdef inline uint32_t _byteswap4(uint32_t num):
+    num = ((num & <uint32_t>0xFFFF0000) >> 16) | ((num & <uint32_t>0x0000FFFF) << 16)
+    return ((num & <uint32_t>0xFF00FF00) >> 8) | ((num & <uint32_t>0x00FF00FF) << 8)
+
+
+cdef inline uint64_t _byteswap8(uint64_t num):
+    num = ((num & <uint64_t>0xFFFFFFFF00000000) >> 32) | ((num & <uint64_t>0x00000000FFFFFFFF) << 32)
+    num = ((num & <uint64_t>0xFFFF0000FFFF0000) >> 16) | ((num & <uint64_t>0x0000FFFF0000FFFF) << 16)
+    return ((num & <uint64_t>0xFF00FF00FF00FF00) >> 8) | ((num & <uint64_t>0x00FF00FF00FF00FF) << 8)
+
+
+cdef inline float _byteswap_float(float num):
+    cdef uint32_t answer = 0
+    memcpy(&answer, &num, 4)
+    answer = _byteswap4(answer)
+    memcpy(&num, &answer, 4)
+    return num
+
+
+cdef inline double _byteswap_double(double num):
+    cdef uint64_t answer = 0
+    memcpy(&answer, &num, 8)
+    answer = _byteswap8(answer)
+    memcpy(&num, &answer, 8)
+    return num
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
@@ -20,7 +20,7 @@
     datetime,
     timedelta,
 )
-import struct
+import sys
 from typing import cast
 
 import numpy as np
@@ -42,7 +42,14 @@
 )
 
 from pandas.io.common import get_handle
-from pandas.io.sas._sas import Parser
+from pandas.io.sas._sas import (
+    Parser,
+    read_double_with_byteswap,
+    read_float_with_byteswap,
+    read_uint16_with_byteswap,
+    read_uint32_with_byteswap,
+    read_uint64_with_byteswap,
+)
 import pandas.io.sas.sas_constants as const
 from pandas.io.sas.sasreader import ReaderBase
 
@@ -259,8 +266,10 @@ def _get_properties(self) -> None:
         buf = self._read_bytes(const.endianness_offset, const.endianness_length)
         if buf == b"\x01":
             self.byte_order = "<"
+            self.need_byteswap = sys.byteorder == "big"
         else:
             self.byte_order = ">"
+            self.need_byteswap = sys.byteorder == "little"
 
         # Get encoding information
         buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
@@ -345,22 +354,37 @@ def __next__(self) -> DataFrame:
 
     # Read a single float of the given width (4 or 8).
     def _read_float(self, offset: int, width: int):
-        if width not in (4, 8):
+        if width == 4:
+            return read_float_with_byteswap(
+                self._read_bytes(offset, 4), self.need_byteswap
+            )
+        elif width == 8:
+            return read_double_with_byteswap(
+                self._read_bytes(offset, 8), self.need_byteswap
+            )
+        else:
             self.close()
             raise ValueError("invalid float width")
-        buf = self._read_bytes(offset, width)
-        fd = "f" if width == 4 else "d"
-        return struct.unpack(self.byte_order + fd, buf)[0]
 
-    # Read a single signed integer of the given width (1, 2, 4 or 8).
+    # Read a single unsigned integer of the given width (1, 2, 4 or 8).
     def _read_int(self, offset: int, width: int) -> int:
-        if width not in (1, 2, 4, 8):
+        if width == 1:
+            return self._read_bytes(offset, 1)[0]
+        elif width == 2:
+            return read_uint16_with_byteswap(
+                self._read_bytes(offset, 2), self.need_byteswap
+            )
+        elif width == 4:
+            return read_uint32_with_byteswap(
+                self._read_bytes(offset, 4), self.need_byteswap
+            )
+        elif width == 8:
+            return read_uint64_with_byteswap(
+                self._read_bytes(offset, 8), self.need_byteswap
+            )
+        else:
             self.close()
             raise ValueError("invalid int width")
-        buf = self._read_bytes(offset, width)
-        it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
-        iv = struct.unpack(self.byte_order + it, buf)[0]
-        return iv
 
     def _read_bytes(self, offset: int, length: int):
         if self._cached_page is None: