Skip to content

Commit b6bb3f9

Browse files
Add extra check for failing UTF-8 conversion
Fixes #23809. Unit test added.
1 parent edb863e commit b6bb3f9

File tree

3 files changed

+10
-1
lines changed

3 files changed

+10
-1
lines changed

pandas/_libs/src/parse_helper.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ int floatify(PyObject *str, double *result, int *maybe_int) {
3434
data = PyBytes_AS_STRING(str);
3535
} else if (PyUnicode_Check(str)) {
3636
tmp = PyUnicode_AsUTF8String(str);
37+
if (tmp == NULL) {
38+
PyErr_SetString(PyExc_TypeError, "UTF8 decode error");
39+
return -1;
40+
}
3741
data = PyBytes_AS_STRING(tmp);
3842
} else {
3943
PyErr_SetString(PyExc_TypeError, "Invalid object type");
10.1 KB
Binary file not shown.

pandas/tests/io/excel/test_readers.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -883,7 +883,6 @@ def test_read_excel_squeeze(self, read_ext):
883883
expected = pd.Series([1, 2, 3], name="a")
884884
tm.assert_series_equal(actual, expected)
885885

886-
887886
class TestExcelFileRead:
888887
@pytest.fixture(autouse=True)
889888
def cd_and_set_engine(self, engine, datapath, monkeypatch):
@@ -1044,3 +1043,9 @@ def test_excel_read_binary(self, engine, read_ext):
10441043

10451044
actual = pd.read_excel(data, engine=engine)
10461045
tm.assert_frame_equal(expected, actual)
1046+
1047+
def test_excel_high_surrogate(self, read_ext):
1048+
# GH 23809
1049+
1050+
# should not produce a segmentation violation
1051+
pd.read_excel('high_surrogate.xlsx')

0 commit comments

Comments
 (0)