Skip to content

SAS7BDAT parser: Fast byteswap #47403

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Oct 5, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
5b9cd4b
Fast byteswap
jonashaag Jun 17, 2022
17c965f
Add types
jonashaag Jun 17, 2022
51499fb
Merge branch 'main' into sas/byteswap
jonashaag Jul 9, 2022
435a003
Review feedback
jonashaag Jul 9, 2022
10ab87f
Slightly faster variant (1 less bytes obj construction)
jonashaag Jul 9, 2022
ad74f5c
Make MyPy happy?
jonashaag Jul 10, 2022
9c5b4b3
Update sas7bdat.py
jonashaag Jul 11, 2022
21c364c
Merge branch 'main' into sas/byteswap
jonashaag Jul 11, 2022
148fa75
Merge branch 'main' into sas/byteswap
jonashaag Jul 15, 2022
f3c63f0
Use intrinsics
jonashaag Jul 21, 2022
78de495
Merge branch 'main' into sas/byteswap
jonashaag Aug 8, 2022
4ef928e
Merge branch 'main' into sas/byteswap
jonashaag Sep 10, 2022
c310c0d
Lint
jonashaag Sep 10, 2022
3b7ba83
Add tests + move byteswap to module
jonashaag Sep 10, 2022
53fbce2
Add float tests + refactoring
jonashaag Sep 10, 2022
9cbc5be
Undo unrelated changes
jonashaag Sep 10, 2022
4802848
Undo unrelated changes
jonashaag Sep 10, 2022
41abe02
Lint
jonashaag Sep 11, 2022
2abd8e0
Merge branch 'main' into sas/byteswap
jonashaag Sep 15, 2022
bf0976a
Update v1.6.0.rst
jonashaag Sep 15, 2022
c725d49
Merge branch 'main' into sas/byteswap
jonashaag Sep 30, 2022
c7c1a2f
read_int -> read_uint
jonashaag Oct 4, 2022
6a4a556
Lint
jonashaag Oct 4, 2022
9f5ba3f
Merge branch 'main' into sas/byteswap
jonashaag Oct 4, 2022
a439434
Update sas7bdat.py
jonashaag Oct 4, 2022
55bd863
Merge branch 'main' into sas/byteswap
jonashaag Oct 4, 2022
bdf8203
Merge branch 'main' into sas/byteswap
jonashaag Oct 5, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pandas/io/sas/_sas.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,9 @@ from pandas.io.sas.sas7bdat import SAS7BDATReader
class Parser:
def __init__(self, parser: SAS7BDATReader) -> None: ...
def read(self, nrows: int) -> None: ...

def read_float_with_byteswap(data: bytes, byteswap: bool) -> float: ...
def read_double_with_byteswap(data: bytes, byteswap: bool) -> float: ...
def read_uint16_with_byteswap(data: bytes, byteswap: bool) -> int: ...
def read_uint32_with_byteswap(data: bytes, byteswap: bool) -> int: ...
def read_uint64_with_byteswap(data: bytes, byteswap: bool) -> int: ...
82 changes: 79 additions & 3 deletions pandas/io/sas/sas.pyx
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
# cython: profile=False
# cython: boundscheck=False, initializedcheck=False
from cython cimport Py_ssize_t
from libc.stdint cimport (
int64_t,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just curious, are these interchangeable with the versions of these we cimport from numpy?

Copy link
Contributor Author

@jonashaag jonashaag Jun 20, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't make a huge difference. On the one hand, it'd be nice to avoid a dependency on numpy when possible; on the other, I'm inevitably going to forget and ask again in 6 months if we don't use the numpy versions.

)
from libc.string cimport memcpy

import numpy as np

import pandas.io.sas.sas_constants as const

ctypedef signed long long int64_t
ctypedef unsigned char uint8_t
ctypedef unsigned short uint16_t

# rle_decompress decompresses data using a Run Length Encoding
# algorithm. It is partially documented here:
Expand Down Expand Up @@ -433,3 +439,73 @@ cdef class Parser:
self.current_row_on_page_index += 1
self.current_row_in_chunk_index += 1
self.current_row_in_file_index += 1


def read_float_with_byteswap(const uint8_t *data, bint byteswap):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add some comments here explaining what you are doing and why?

cdef float res = (<float*>data)[0]
if byteswap:
res = _byteswap_float(res)
return res


def read_double_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode a 64-bit float from the first eight bytes of ``data``.

    Parameters
    ----------
    data : const uint8_t*
        Buffer holding the raw value. The length is not checked here, so the
        caller must supply at least eight readable bytes.
    byteswap : bint
        If True, reverse the byte order of the value before decoding it.

    Returns
    -------
    The decoded value, converted to a Python float on return.
    """
    cdef:
        uint64_t bits = 0
        double res = 0
    # Copy the bytes instead of dereferencing ``<double*>data``: the buffer
    # carries no alignment guarantee for double, and the cast drops ``const``.
    memcpy(&bits, data, sizeof(bits))
    if byteswap:
        # Swap the integer bit pattern so the not-yet-swapped bytes are never
        # interpreted as a floating-point value.
        bits = _byteswap8(bits)
    memcpy(&res, &bits, sizeof(res))
    return res


def read_uint16_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode an unsigned 16-bit integer from the first two bytes of ``data``.

    Parameters
    ----------
    data : const uint8_t*
        Buffer holding the raw value. The length is not checked here, so the
        caller must supply at least two readable bytes.
    byteswap : bint
        If True, reverse the byte order of the value before returning it.

    Returns
    -------
    The decoded value, converted to a Python int on return.
    """
    cdef uint16_t res = 0
    # Copy the bytes instead of dereferencing ``<uint16_t *>data``: the buffer
    # carries no alignment guarantee for uint16_t, and the cast drops ``const``.
    memcpy(&res, data, sizeof(res))
    if byteswap:
        res = _byteswap2(res)
    return res


def read_uint32_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode an unsigned 32-bit integer from the first four bytes of ``data``.

    Parameters
    ----------
    data : const uint8_t*
        Buffer holding the raw value. The length is not checked here, so the
        caller must supply at least four readable bytes.
    byteswap : bint
        If True, reverse the byte order of the value before returning it.

    Returns
    -------
    The decoded value, converted to a Python int on return.
    """
    cdef uint32_t res = 0
    # Copy the bytes instead of dereferencing ``<uint32_t *>data``: the buffer
    # carries no alignment guarantee for uint32_t, and the cast drops ``const``.
    memcpy(&res, data, sizeof(res))
    if byteswap:
        res = _byteswap4(res)
    return res


def read_uint64_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode an unsigned 64-bit integer from the first eight bytes of ``data``.

    Parameters
    ----------
    data : const uint8_t*
        Buffer holding the raw value. The length is not checked here, so the
        caller must supply at least eight readable bytes.
    byteswap : bint
        If True, reverse the byte order of the value before returning it.

    Returns
    -------
    The decoded value, converted to a Python int on return.
    """
    cdef uint64_t res = 0
    # Copy the bytes instead of dereferencing ``<uint64_t *>data``: the buffer
    # carries no alignment guarantee for uint64_t, and the cast drops ``const``.
    memcpy(&res, data, sizeof(res))
    if byteswap:
        res = _byteswap8(res)
    return res


# Byteswapping
# From https://github.com/WizardMac/ReadStat/blob/master/src/readstat_bits.c
# Copyright (c) 2013-2016 Evan Miller, Apache 2 License

cdef inline uint16_t _byteswap2(uint16_t num):
    # Reverse the byte order of a 16-bit value: the high byte moves down to
    # the low position and the low byte moves up to the high position.
    cdef uint16_t high_to_low = (num >> 8) & 0x00FF
    cdef uint16_t low_to_high = (num << 8) & 0xFF00
    return high_to_low | low_to_high


cdef inline uint32_t _byteswap4(uint32_t num):
    # Reverse the byte order of a 32-bit value in two passes:
    # first exchange the two 16-bit halves, then exchange the two bytes
    # inside each half.
    cdef uint32_t halves = (num >> 16) | (num << 16)
    cdef uint32_t odd_bytes = (halves >> 8) & <uint32_t>0x00FF00FF
    cdef uint32_t even_bytes = (halves << 8) & <uint32_t>0xFF00FF00
    return odd_bytes | even_bytes


cdef inline uint64_t _byteswap8(uint64_t num):
    # Reverse the byte order of a 64-bit value in three passes:
    # exchange the 32-bit halves, then the 16-bit quarters inside each half,
    # then the two bytes inside each quarter.
    cdef uint64_t swapped = (num >> 32) | (num << 32)
    swapped = (
        ((swapped >> 16) & <uint64_t>0x0000FFFF0000FFFF)
        | ((swapped << 16) & <uint64_t>0xFFFF0000FFFF0000)
    )
    return (
        ((swapped >> 8) & <uint64_t>0x00FF00FF00FF00FF)
        | ((swapped << 8) & <uint64_t>0xFF00FF00FF00FF00)
    )


cdef inline float _byteswap_float(float num):
cdef uint32_t answer = 0
memcpy(&answer, &num, 4)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use sizeof instead of hard coding the size?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a verbatim copy from ReadStat, do you still want me to make that modification?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah ok. I think fine to keep as is then

answer = _byteswap4(answer)
memcpy(&num, &answer, 4)
return num


cdef inline double _byteswap_double(double num):
    # Reverse the byte order of a double by round-tripping its raw bytes
    # through a 64-bit unsigned integer, which _byteswap8 can reverse.
    cdef uint64_t bits = 0
    memcpy(&bits, &num, sizeof(uint64_t))
    bits = _byteswap8(bits)
    memcpy(&num, &bits, sizeof(uint64_t))
    return num
46 changes: 35 additions & 11 deletions pandas/io/sas/sas7bdat.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
datetime,
timedelta,
)
import struct
import sys
from typing import cast

import numpy as np
Expand All @@ -42,7 +42,14 @@
)

from pandas.io.common import get_handle
from pandas.io.sas._sas import Parser
from pandas.io.sas._sas import (
Parser,
read_double_with_byteswap,
read_float_with_byteswap,
read_uint16_with_byteswap,
read_uint32_with_byteswap,
read_uint64_with_byteswap,
)
import pandas.io.sas.sas_constants as const
from pandas.io.sas.sasreader import ReaderBase

Expand Down Expand Up @@ -259,8 +266,10 @@ def _get_properties(self) -> None:
buf = self._read_bytes(const.endianness_offset, const.endianness_length)
if buf == b"\x01":
self.byte_order = "<"
self.need_byteswap = sys.byteorder == "big"
else:
self.byte_order = ">"
self.need_byteswap = sys.byteorder == "little"

# Get encoding information
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
Expand Down Expand Up @@ -345,22 +354,37 @@ def __next__(self) -> DataFrame:

# Read a single float of the given width (4 or 8).
def _read_float(self, offset: int, width: int):
if width not in (4, 8):
if width == 4:
return read_float_with_byteswap(
self._read_bytes(offset, 4), self.need_byteswap
)
elif width == 8:
return read_double_with_byteswap(
self._read_bytes(offset, 8), self.need_byteswap
)
else:
self.close()
raise ValueError("invalid float width")
buf = self._read_bytes(offset, width)
fd = "f" if width == 4 else "d"
return struct.unpack(self.byte_order + fd, buf)[0]

# Read a single unsigned integer of the given width (1, 2, 4 or 8).
def _read_int(self, offset: int, width: int) -> int:
if width not in (1, 2, 4, 8):
if width == 1:
return self._read_bytes(offset, 1)[0]
elif width == 2:
return read_uint16_with_byteswap(
self._read_bytes(offset, 2), self.need_byteswap
)
elif width == 4:
return read_uint32_with_byteswap(
self._read_bytes(offset, 4), self.need_byteswap
)
elif width == 8:
return read_uint64_with_byteswap(
self._read_bytes(offset, 8), self.need_byteswap
)
else:
self.close()
raise ValueError("invalid int width")
buf = self._read_bytes(offset, width)
it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
iv = struct.unpack(self.byte_order + it, buf)[0]
return iv

def _read_bytes(self, offset: int, length: int):
if self._cached_page is None:
Expand Down