From ba4e3c0bb54501db3aecc16232f758ce643e31a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?=
Date: Fri, 5 Feb 2021 09:53:45 -0500
Subject: [PATCH 1/2] REGR: appending to existing excel file created corrupt files

---
 doc/source/whatsnew/v1.2.2.rst         |  1 +
 pandas/io/excel/_openpyxl.py           |  2 ++
 pandas/tests/io/excel/test_openpyxl.py | 20 ++++++++++++++++++++
 3 files changed, 23 insertions(+)

diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst
index cc5653fe2f360..a67cade1c00fe 100644
--- a/doc/source/whatsnew/v1.2.2.rst
+++ b/doc/source/whatsnew/v1.2.2.rst
@@ -21,6 +21,7 @@ Fixed regressions
 - Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`)
 - Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`)
 - Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
+- Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
index 64c64b5009b0c..a9442080f0b32 100644
--- a/pandas/io/excel/_openpyxl.py
+++ b/pandas/io/excel/_openpyxl.py
@@ -40,6 +40,8 @@ def __init__(
             from openpyxl import load_workbook
 
             self.book = load_workbook(self.handles.handle)
+            self.handles.handle.seek(0)
+            self.handles.handle.truncate()
         else:
             # Create workbook object with default optimized_write=True.
             self.book = Workbook()
diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py
index 640501baffc62..b365f4edab83c 100644
--- a/pandas/tests/io/excel/test_openpyxl.py
+++ b/pandas/tests/io/excel/test_openpyxl.py
@@ -1,4 +1,5 @@
 from distutils.version import LooseVersion
+from pathlib import Path
 
 import numpy as np
 import pytest
@@ -149,3 +150,22 @@ def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename)
     result = pd.read_excel(path, header=header)
     expected = DataFrame(expected_data)
     tm.assert_frame_equal(result, expected)
+
+
+def test_append_mode_file(ext):
+    # GH 39576
+    df = DataFrame()
+
+    with tm.ensure_clean(ext) as f:
+        df.to_excel(f, engine="openpyxl")
+
+        with ExcelWriter(f, mode="a", engine="openpyxl") as writer:
+            df.to_excel(writer)
+
+        # make sure that zip files are not concatenated by making sure that
+        # "docProps/app.xml" only occurs twice in the file
+        data = Path(f).read_bytes()
+        first = data.find(b"docProps/app.xml")
+        second = data.find(b"docProps/app.xml", first + 1)
+        third = data.find(b"docProps/app.xml", second + 1)
+        assert second != -1 and third == -1

From 8bf62d417fc4faa2079cd3e8197fbecd8d87d6c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?=
Date: Fri, 5 Feb 2021 15:21:35 -0500
Subject: [PATCH 2/2] truncate after writing

---
 pandas/io/excel/_openpyxl.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
index a9442080f0b32..b53db6c726c4d 100644
--- a/pandas/io/excel/_openpyxl.py
+++ b/pandas/io/excel/_openpyxl.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from distutils.version import LooseVersion
+import mmap
 from typing import TYPE_CHECKING, Dict, List, Optional
 
 import numpy as np
@@ -41,7 +42,6 @@ def __init__(
 
             self.book = load_workbook(self.handles.handle)
             self.handles.handle.seek(0)
-            self.handles.handle.truncate()
         else:
             # Create workbook object with default optimized_write=True.
             self.book = Workbook()
@@ -54,6 +54,9 @@ def save(self):
         Save workbook to disk.
         """
         self.book.save(self.handles.handle)
+        if "r+" in self.mode and not isinstance(self.handles.handle, mmap.mmap):
+            # truncate file to the written content
+            self.handles.handle.truncate()
 
     @classmethod
     def _convert_to_style_kwargs(cls, style_dict: dict) -> Dict[str, Serialisable]: