diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index d644a995a4876..5d962ff04464e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -302,6 +302,7 @@ I/O timestamps with ``version="2.0"`` (:issue:`31652`). - Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`) - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) +- Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) Plotting diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 7fbe7a04d5b22..2ada0a4bd173d 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -34,6 +34,9 @@ int floatify(PyObject *str, double *result, int *maybe_int) { data = PyBytes_AS_STRING(str); } else if (PyUnicode_Check(str)) { tmp = PyUnicode_AsUTF8String(str); + if (tmp == NULL) { + return -1; + } data = PyBytes_AS_STRING(tmp); } else { PyErr_SetString(PyExc_TypeError, "Invalid object type"); diff --git a/pandas/tests/io/data/excel/high_surrogate.xlsx b/pandas/tests/io/data/excel/high_surrogate.xlsx new file mode 100644 index 0000000000000..1e29b6bee6586 Binary files /dev/null and b/pandas/tests/io/data/excel/high_surrogate.xlsx differ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a59b409809eed..cbc043820e35e 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1044,3 +1044,11 @@ def test_excel_read_binary(self, engine, read_ext): actual = pd.read_excel(data, engine=engine) tm.assert_frame_equal(expected, actual) + + def test_excel_high_surrogate(self, engine): + # GH 23809 + expected = pd.DataFrame(["\udc88"], columns=["Column1"]) + + # should not produce a segmentation violation + actual = pd.read_excel("high_surrogate.xlsx") + tm.assert_frame_equal(expected, actual)