Add extra check for failing UTF-8 conversion

roberthdevries · roberthdevries · commit b6bb3f9d8192 · 2020-03-08T23:10:31.000+01:00
Fixes #23809. Unit test added.
diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h
@@ -34,6 +34,10 @@ int floatify(PyObject *str, double *result, int *maybe_int) {
         data = PyBytes_AS_STRING(str);
     } else if (PyUnicode_Check(str)) {
         tmp = PyUnicode_AsUTF8String(str);
+        if (tmp == NULL) {
+            PyErr_SetString(PyExc_TypeError, "UTF8 decode error");
+            return -1;
+        }
         data = PyBytes_AS_STRING(tmp);
     } else {
         PyErr_SetString(PyExc_TypeError, "Invalid object type");
diff --git a/pandas/tests/io/data/excel/high_surrogate.xlsx b/pandas/tests/io/data/excel/high_surrogate.xlsx
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -883,7 +883,6 @@ def test_read_excel_squeeze(self, read_ext):
         expected = pd.Series([1, 2, 3], name="a")
         tm.assert_series_equal(actual, expected)
 
-
 class TestExcelFileRead:
     @pytest.fixture(autouse=True)
     def cd_and_set_engine(self, engine, datapath, monkeypatch):
@@ -1044,3 +1043,9 @@ def test_excel_read_binary(self, engine, read_ext):
 
         actual = pd.read_excel(data, engine=engine)
         tm.assert_frame_equal(expected, actual)
+
+    def test_excel_high_surrogate(self, read_ext):
+        # GH 23809
+
+        # should not produce a segmentation violation
+        pd.read_excel('high_surrogate.xlsx')