Skip to content

Commit f5293f7

Browse files
committed
ENH: allow user to infer SAS file encoding; use detected encoding; add correct encoding names
1 parent f0814bc commit f5293f7

File tree

3 files changed

+42
-9
lines changed

3 files changed

+42
-9
lines changed

pandas/io/sas/sas7bdat.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,10 @@ class SAS7BDATReader(ReaderBase, abc.Iterator):
147147
chunksize : int, defaults to None
148148
Return SAS7BDATReader object for iterations, returns chunks
149149
with given number of lines.
150-
encoding : string, defaults to None
151-
String encoding.
150+
encoding : str, 'infer', defaults to None
151+
String encoding according to Python standard encodings,
152+
encoding='infer' tries to detect the encoding from the file header,
153+
encoding=None will leave the data in binary format.
152154
convert_text : bool, defaults to True
153155
If False, text variables are left as raw bytes.
154156
convert_header_text : bool, defaults to True
@@ -265,9 +267,11 @@ def _get_properties(self) -> None:
265267
# Get encoding information
266268
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
267269
if buf in const.encoding_names:
268-
self.file_encoding = const.encoding_names[buf]
270+
self.inferred_encoding = const.encoding_names[buf]
271+
if self.encoding == "infer":
272+
self.encoding = self.inferred_encoding
269273
else:
270-
self.file_encoding = f"unknown (code={buf})"
274+
self.inferred_encoding = f"unknown (code={buf})"
271275

272276
# Get platform information
273277
buf = self._read_bytes(const.platform_offset, const.platform_length)

pandas/io/sas/sas_constants.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,14 +107,21 @@
107107
compression_literals: Final = [rle_compression, rdc_compression]
108108

109109
# Incomplete list of encodings, using SAS nomenclature:
110-
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
110+
# https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html
111+
# mapped to the names listed in the Python documentation of standard encodings:
112+
# https://docs.python.org/3/library/codecs.html#standard-encodings
111113
encoding_names: Final = {
112-
29: "latin1",
113114
20: "utf-8",
115+
29: "latin1",
116+
30: "latin2",
117+
31: "latin3",
118+
32: "latin4",
114119
33: "cyrillic",
115-
60: "wlatin2",
116-
61: "wcyrillic",
117-
62: "wlatin1",
120+
34: "arabic",
121+
35: "greek",
122+
60: "cp1250", # wlatin2
123+
61: "cp1251", # wcyrillic
124+
62: "cp1252", # wlatin1
118125
90: "ebcdic870",
119126
}
120127

pandas/tests/io/sas/test_sas7bdat.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,28 @@ def test_encoding_options(datapath):
136136
assert x == y.decode()
137137

138138

139+
def test_encoding_infer(datapath):
140+
from pandas.io.sas.sas7bdat import SAS7BDATReader
141+
142+
fname = datapath("io", "sas", "data", "test1.sas7bdat")
143+
144+
# check if inferred correctly
145+
df1_reader: SAS7BDATReader = pd.read_sas(fname, encoding="infer", iterator=True)
146+
assert (
147+
df1_reader.inferred_encoding == "cp1252"
148+
), f"""
149+
Encoding has been inferred incorrectly:
150+
{df1_reader.inferred_encoding} instead of 'cp1252'
151+
"""
152+
153+
# check if the reader reads correctly with encoding
154+
df1 = df1_reader.read()
155+
df2_reader: SAS7BDATReader = pd.read_sas(fname, encoding="cp1252", iterator=True)
156+
df2 = df2_reader.read()
157+
158+
tm.assert_frame_equal(df1, df2)
159+
160+
139161
def test_productsales(datapath):
140162
fname = datapath("io", "sas", "data", "productsales.sas7bdat")
141163
df = pd.read_sas(fname, encoding="utf-8")

0 commit comments

Comments
 (0)