Skip to content

Commit c6551f6

Browse files
committed
Test for GH43540
1 parent 2d269ae commit c6551f6

File tree

1 file changed

+17
-0
lines changed

1 file changed

+17
-0
lines changed

pandas/tests/io/test_common.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,23 @@ def test_warning_missing_utf_bom(self, encoding, compression_):
497497
with pytest.raises(UnicodeError, match=msg):
498498
pd.read_csv(path, compression=compression_, encoding=encoding)
499499

500+
def test_chunk_splits_multibyte_char(self):
501+
"""
502+
Chunk splits a multibyte character with memory_map=True
503+
504+
GH 43540
505+
"""
506+
# DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
507+
df = pd.DataFrame(data=["a" * 127] * 2048)
508+
509+
# Put two-bytes utf-8 encoded character "ą" at the end of chunk
510+
# utf-8 encoding of "ą" is b'\xc4\x85'
511+
df.iloc[2047] = "a" * 127 + "ą"
512+
df.to_csv("data/bug-gh43540.csv", index=False, header=False, encoding="utf-8")
513+
dfr = pd.read_csv("data/bug-gh43540.csv", header=None, memory_map=True)
514+
tm.ensure_clean("data/bug-gh43540.csv")
515+
assert (dfr == df).all()[0]
516+
500517

501518
def test_is_fsspec_url():
502519
assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")

0 commit comments

Comments
 (0)