From 2f02697264818123d1112ff1f09e1d2e95a5d94d Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Tue, 21 Mar 2017 16:50:48 +0000 Subject: [PATCH 1/2] BUG: Enforce correct encoding in stata Ensure StataReader and StataWriter have the correct encoding. Standardized default encoding to 'latin-1' closes #15723 --- doc/source/whatsnew/v0.20.0.txt | 3 +++ pandas/io/stata.py | 23 +++++++++++++++++------ pandas/tests/io/test_stata.py | 6 ++++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index e0d15c218ec85..b57b6a3898fd4 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -917,6 +917,8 @@ Bug Fixes - Avoid use of ``np.finfo()`` during ``import pandas`` removed to mitigate deadlock on Python GIL misuse (:issue:`14641`) - Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produces incorrectly formatted files to be produced for some locales (:issue:`13856`) +- Bug in ``StataReader`` and ``StataWriter`` which allows invalid encodings (:issue:`15723`) + - Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`) - Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`, :issue:`13046`) @@ -931,3 +933,4 @@ Bug Fixes - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) + diff --git a/pandas/io/stata.py b/pandas/io/stata.py index af4bc6a6b7ddb..12f8b8bce22c3 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -33,6 +33,9 @@ from pandas._libs.lib import max_len_string_array, infer_dtype from pandas._libs.tslib import NaT, 
Timestamp +VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1', + 'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1') + _version_error = ("Version of given Stata file is not 104, 105, 108, " "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)") @@ -45,7 +48,7 @@ _encoding_params = """\ encoding : string, None or encoding - Encoding used to parse the files. None defaults to iso-8859-1.""" + Encoding used to parse the files. None defaults to latin-1.""" _statafile_processing_params2 = """\ index : identifier of index column @@ -153,7 +156,7 @@ @Appender(_read_stata_doc) def read_stata(filepath_or_buffer, convert_dates=True, - convert_categoricals=True, encoding=None, index=None, + convert_categoricals=True, encoding='latin-1', index=None, convert_missing=False, preserve_dtypes=True, columns=None, order_categoricals=True, chunksize=None, iterator=False): @@ -816,9 +819,14 @@ def get_base_missing_value(cls, dtype): class StataParser(object): - _default_encoding = 'iso-8859-1' + _default_encoding = 'latin-1' + + def __init__(self, encoding='latin-1'): + + if encoding not in VALID_ENCODINGS: + raise ValueError('Unknown encoding. Only latin-1 and ascii ' + 'supported.') - def __init__(self, encoding): self._encoding = encoding # type code. @@ -936,7 +944,7 @@ def __init__(self, path_or_buf, convert_dates=True, convert_categoricals=True, index=None, convert_missing=False, preserve_dtypes=True, columns=None, order_categoricals=True, - encoding='iso-8859-1', chunksize=None): + encoding='latin-1', chunksize=None): super(StataReader, self).__init__(encoding) self.col_sizes = () @@ -949,6 +957,9 @@ def __init__(self, path_or_buf, convert_dates=True, self._preserve_dtypes = preserve_dtypes self._columns = columns self._order_categoricals = order_categoricals + if encoding not in VALID_ENCODINGS: + raise ValueError('Unknown encoding. 
Only latin-1 and ascii ' + 'supported.') self._encoding = encoding self._chunksize = chunksize @@ -1855,7 +1866,7 @@ class StataWriter(StataParser): write_index : bool Write the index to Stata dataset. encoding : str - Default is latin-1. Unicode is not supported + Default is latin-1. Only latin-1 and ascii are supported. byteorder : str Can be ">", "<", "little", or "big". default is `sys.byteorder` time_stamp : datetime diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 5188adf54b887..5f6caa27ded2a 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1276,3 +1276,9 @@ def test_out_of_range_float(self): original.to_stata(path) tm.assertTrue('ColumnTooBig' in cm.exception) tm.assertTrue('infinity' in cm.exception) + + def test_invalid_encoding(self): + original = self.read_csv(self.csv3) + with tm.assertRaises(ValueError): + with tm.ensure_clean() as path: + original.to_stata(path, encoding='utf-8') From 8278be7dc466229f92f938e38514b9225a65b1f9 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Tue, 21 Mar 2017 18:17:47 +0000 Subject: [PATCH 2/2] BUG: Fix limited key range on 32-bit platforms Fix use of 64-bit integers as keys in general string objects (GSO) by wrapping in strings when used as dictionary keys --- pandas/io/stata.py | 28 ++++++++++++++++------------ pandas/tests/io/test_stata.py | 1 + 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 12f8b8bce22c3..1d2951da68086 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -156,7 +156,7 @@ @Appender(_read_stata_doc) def read_stata(filepath_or_buffer, convert_dates=True, - convert_categoricals=True, encoding='latin-1', index=None, + convert_categoricals=True, encoding=None, index=None, convert_missing=False, preserve_dtypes=True, columns=None, order_categoricals=True, chunksize=None, iterator=False): @@ -821,11 +821,11 @@ def get_base_missing_value(cls, dtype): class 
StataParser(object): _default_encoding = 'latin-1' - def __init__(self, encoding='latin-1'): - - if encoding not in VALID_ENCODINGS: - raise ValueError('Unknown encoding. Only latin-1 and ascii ' - 'supported.') + def __init__(self, encoding): + if encoding is not None: + if encoding not in VALID_ENCODINGS: + raise ValueError('Unknown encoding. Only latin-1 and ascii ' + 'supported.') self._encoding = encoding @@ -957,9 +957,10 @@ def __init__(self, path_or_buf, convert_dates=True, self._preserve_dtypes = preserve_dtypes self._columns = columns self._order_categoricals = order_categoricals - if encoding not in VALID_ENCODINGS: - raise ValueError('Unknown encoding. Only latin-1 and ascii ' - 'supported.') + if encoding is not None: + if encoding not in VALID_ENCODINGS: + raise ValueError('Unknown encoding. Only latin-1 and ascii ' + 'supported.') self._encoding = encoding self._chunksize = chunksize @@ -1373,7 +1374,8 @@ def _read_value_labels(self): def _read_strls(self): self.path_or_buf.seek(self.seek_strls) - self.GSO = {0: ''} + # Wrap v_o in a string to allow uint64 values as keys on 32bit OS + self.GSO = {'0': ''} while True: if self.path_or_buf.read(3) != b'GSO': break @@ -1398,7 +1400,8 @@ def _read_strls(self): if self.format_version == 117: encoding = self._encoding or self._default_encoding va = va[0:-1].decode(encoding) - self.GSO[v_o] = va + # Wrap v_o in a string to allow uint64 values as keys on 32bit OS + self.GSO[str(v_o)] = va # legacy @Appender('DEPRECATED: ' + _data_method_doc) @@ -1634,7 +1637,8 @@ def _insert_strls(self, data): for i, typ in enumerate(self.typlist): if typ != 'Q': continue - data.iloc[:, i] = [self.GSO[k] for k in data.iloc[:, i]] + # Wrap v_o in a string to allow uint64 values as keys on 32bit OS + data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]] return data def _do_select_columns(self, data, columns): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 5f6caa27ded2a..064f7de971919 
100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1277,6 +1277,7 @@ def test_out_of_range_float(self): tm.assertTrue('ColumnTooBig' in cm.exception) tm.assertTrue('infinity' in cm.exception) + # GH15723, validate encoding def test_invalid_encoding(self): original = self.read_csv(self.csv3) with tm.assertRaises(ValueError):