-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
SAS7BDAT parser: Fast byteswap #47403
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
5b9cd4b
17c965f
51499fb
435a003
10ab87f
ad74f5c
9c5b4b3
21c364c
148fa75
f3c63f0
78de495
4ef928e
c310c0d
3b7ba83
53fbce2
9cbc5be
4802848
41abe02
2abd8e0
bf0976a
c725d49
c7c1a2f
6a4a556
9f5ba3f
a439434
55bd863
bdf8203
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,19 @@ | ||
# cython: profile=False | ||
# cython: boundscheck=False, initializedcheck=False | ||
from cython cimport Py_ssize_t | ||
from libc.stdint cimport ( | ||
int64_t, | ||
uint8_t, | ||
uint16_t, | ||
uint32_t, | ||
uint64_t, | ||
) | ||
from libc.string cimport memcpy | ||
|
||
import numpy as np | ||
|
||
import pandas.io.sas.sas_constants as const | ||
|
||
ctypedef signed long long int64_t | ||
ctypedef unsigned char uint8_t | ||
ctypedef unsigned short uint16_t | ||
|
||
# rle_decompress decompresses data using a Run Length Encoding | ||
# algorithm. It is partially documented here: | ||
|
@@ -433,3 +439,73 @@ cdef class Parser: | |
self.current_row_on_page_index += 1 | ||
self.current_row_in_chunk_index += 1 | ||
self.current_row_in_file_index += 1 | ||
|
||
|
||
# Read one C float from the start of ``data``, optionally reversing its bytes. | ||
# | ||
# data:     pointer to the raw bytes to decode. | ||
# byteswap: when true, the value read is passed through _byteswap_float | ||
#           before being returned. | ||
#           (presumably set when the file's byte order differs from the | ||
#           host's -- confirm against the caller) | ||
# | ||
# NOTE(review): no length check is visible here; the caller must guarantee | ||
# that at least sizeof(float) bytes are readable at ``data``. | ||
def read_float_with_byteswap(const uint8_t *data, bint byteswap): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add some comments here on what and why you are doing this. |
||
    # Reinterpret the leading bytes of ``data`` as a float via a pointer cast. | ||
    cdef float res = (<float*>data)[0] | ||
    if byteswap: | ||
        res = _byteswap_float(res) | ||
    return res | ||
|
||
|
||
# Read one C double from the start of ``data``, optionally reversing its bytes. | ||
# | ||
# data:     pointer to the raw bytes to decode. | ||
# byteswap: when true, the value read is passed through _byteswap_double | ||
#           before being returned. | ||
# | ||
# NOTE(review): no length check is visible here; the caller must guarantee | ||
# that at least sizeof(double) bytes are readable at ``data``. | ||
def read_double_with_byteswap(const uint8_t *data, bint byteswap): | ||
    # Reinterpret the leading bytes of ``data`` as a double via a pointer cast. | ||
    cdef double res = (<double*>data)[0] | ||
    if byteswap: | ||
        res = _byteswap_double(res) | ||
    return res | ||
|
||
|
||
# Read one uint16_t from the start of ``data``, optionally reversing its bytes. | ||
# | ||
# data:     pointer to the raw bytes to decode. | ||
# byteswap: when true, the value read is passed through _byteswap2 | ||
#           before being returned. | ||
# | ||
# NOTE(review): no length check is visible here; the caller must guarantee | ||
# that at least 2 bytes are readable at ``data``. | ||
def read_uint16_with_byteswap(const uint8_t *data, bint byteswap): | ||
    # Reinterpret the leading bytes of ``data`` as a uint16_t via a pointer cast. | ||
    cdef uint16_t res = (<uint16_t *>data)[0] | ||
    if byteswap: | ||
        res = _byteswap2(res) | ||
    return res | ||
|
||
|
||
# Read one uint32_t from the start of ``data``, optionally reversing its bytes. | ||
# | ||
# data:     pointer to the raw bytes to decode. | ||
# byteswap: when true, the value read is passed through _byteswap4 | ||
#           before being returned. | ||
# | ||
# NOTE(review): no length check is visible here; the caller must guarantee | ||
# that at least 4 bytes are readable at ``data``. | ||
def read_uint32_with_byteswap(const uint8_t *data, bint byteswap): | ||
    # Reinterpret the leading bytes of ``data`` as a uint32_t via a pointer cast. | ||
    cdef uint32_t res = (<uint32_t *>data)[0] | ||
    if byteswap: | ||
        res = _byteswap4(res) | ||
    return res | ||
|
||
|
||
# Read one uint64_t from the start of ``data``, optionally reversing its bytes. | ||
# | ||
# data:     pointer to the raw bytes to decode. | ||
# byteswap: when true, the value read is passed through _byteswap8 | ||
#           before being returned. | ||
# | ||
# NOTE(review): no length check is visible here; the caller must guarantee | ||
# that at least 8 bytes are readable at ``data``. | ||
def read_uint64_with_byteswap(const uint8_t *data, bint byteswap): | ||
    # Reinterpret the leading bytes of ``data`` as a uint64_t via a pointer cast. | ||
    cdef uint64_t res = (<uint64_t *>data)[0] | ||
    if byteswap: | ||
        res = _byteswap8(res) | ||
    return res | ||
|
||
|
||
# Byteswapping | ||
# From https://github.com/WizardMac/ReadStat/blob/master/src/readstat_bits.c | ||
# Copyright (c) 2013-2016 Evan Miller, Apache 2 License | ||
|
||
# Return ``num`` with its two bytes exchanged. | ||
cdef inline uint16_t _byteswap2(uint16_t num): | ||
    # High byte moves down 8 bits, low byte moves up 8 bits, then both are OR-ed. | ||
    return ((num & 0xFF00) >> 8) | ((num & 0x00FF) << 8) | ||
|
||
|
||
# Return ``num`` with its four bytes in reversed order. | ||
cdef inline uint32_t _byteswap4(uint32_t num): | ||
    # Step 1: exchange the upper and lower 16-bit halves. | ||
    num = ((num & <uint32_t>0xFFFF0000) >> 16) | ((num & <uint32_t>0x0000FFFF) << 16) | ||
    # Step 2: exchange the two bytes inside each 16-bit half. | ||
    return ((num & <uint32_t>0xFF00FF00) >> 8) | ((num & <uint32_t>0x00FF00FF) << 8) | ||
|
||
|
||
# Return ``num`` with its eight bytes in reversed order. | ||
cdef inline uint64_t _byteswap8(uint64_t num): | ||
    # Step 1: exchange the upper and lower 32-bit halves. | ||
    num = ((num & <uint64_t>0xFFFFFFFF00000000) >> 32) | ((num & <uint64_t>0x00000000FFFFFFFF) << 32) | ||
    # Step 2: exchange the 16-bit pairs inside each 32-bit half. | ||
    num = ((num & <uint64_t>0xFFFF0000FFFF0000) >> 16) | ((num & <uint64_t>0x0000FFFF0000FFFF) << 16) | ||
    # Step 3: exchange the two bytes inside each 16-bit pair. | ||
    return ((num & <uint64_t>0xFF00FF00FF00FF00) >> 8) | ((num & <uint64_t>0x00FF00FF00FF00FF) << 8) | ||
|
||
|
||
# Return ``num`` with the byte order of its 4-byte representation reversed. | ||
# The bytes are moved with memcpy into an integer of the same width so that | ||
# they are copied unchanged rather than converted numerically. | ||
cdef inline float _byteswap_float(float num): | ||
    cdef uint32_t answer = 0 | ||
    # Copy the float's 4 bytes into the integer scratch variable. | ||
    memcpy(&answer, &num, 4) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you use sizeof instead of hard coding the size? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a verbatim copy from ReadStat, do you still want me to make that modification? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah ok. I think fine to keep as is then |
||
    answer = _byteswap4(answer) | ||
    # Copy the swapped bytes back over the float. | ||
    memcpy(&num, &answer, 4) | ||
    return num | ||
|
||
|
||
# Return ``num`` with the byte order of its 8-byte representation reversed. | ||
# The bytes are moved with memcpy into an integer of the same width so that | ||
# they are copied unchanged rather than converted numerically. | ||
cdef inline double _byteswap_double(double num): | ||
    cdef uint64_t answer = 0 | ||
    # Copy the double's 8 bytes into the integer scratch variable. | ||
    memcpy(&answer, &num, 8) | ||
    answer = _byteswap8(answer) | ||
    # Copy the swapped bytes back over the double. | ||
    memcpy(&num, &answer, 8) | ||
    return num |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just curious, are these interchangeable with the versions of these we cimport from numpy?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Interesting, I didn't know those exist.
`size_t` and friends: https://github.com/cython/cython/blob/f753deecd09e011a1bc276b78ccc0f1c0ad67f09/Cython/Includes/numpy/__init__.pxd#L27-L32
`uint64_t` and friends: https://github.com/cython/cython/blob/f753deecd09e011a1bc276b78ccc0f1c0ad67f09/Cython/Includes/numpy/__init__.pxd#L746 -> https://github.com/cython/cython/blob/f753deecd09e011a1bc276b78ccc0f1c0ad67f09/Cython/Includes/numpy/__init__.pxd#L325
So this looks identical in both cases, but I'm happy to import from NumPy if that's preferred.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Doesn't make a huge difference. On the one hand, it'd be nice to avoid a dependency on NumPy when possible; on the other, I'm inevitably going to forget and ask again in 6 months if we don't use the NumPy versions.