diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py
index 8a857c033a2de..28ffe316e21dc 100644
--- a/pandas/tests/frame/methods/test_to_csv.py
+++ b/pandas/tests/frame/methods/test_to_csv.py
@@ -24,18 +24,6 @@
 from pandas.io.common import get_handle
 
-MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"]
-MIXED_INT_DTYPES = [
-    "uint8",
-    "uint16",
-    "uint32",
-    "uint64",
-    "int8",
-    "int16",
-    "int32",
-    "int64",
-]
-
 
 class TestDataFrameToCSV:
     def read_csv(self, path, **kwargs):
@@ -178,46 +166,40 @@ def test_to_csv_cols_reordering(self):
 
         tm.assert_frame_equal(df[cols], rs_c, check_names=False)
 
-    def test_to_csv_new_dupe_cols(self):
-        def _check_df(df, cols=None):
-            with tm.ensure_clean() as path:
-                df.to_csv(path, columns=cols, chunksize=chunksize)
-                rs_c = read_csv(path, index_col=0)
-
-                # we wrote them in a different order
-                # so compare them in that order
-                if cols is not None:
-
-                    if df.columns.is_unique:
-                        rs_c.columns = cols
-                    else:
-                        indexer, missing = df.columns.get_indexer_non_unique(cols)
-                        rs_c.columns = df.columns.take(indexer)
-
-                    for c in cols:
-                        obj_df = df[c]
-                        obj_rs = rs_c[c]
-                        if isinstance(obj_df, Series):
-                            tm.assert_series_equal(obj_df, obj_rs)
-                        else:
-                            tm.assert_frame_equal(obj_df, obj_rs, check_names=False)
-
-                # wrote in the same order
-                else:
-                    rs_c.columns = df.columns
-                    tm.assert_frame_equal(df, rs_c, check_names=False)
-
+    @pytest.mark.parametrize("cols", [None, ["b", "a"]])
+    def test_to_csv_new_dupe_cols(self, cols):
         chunksize = 5
         N = int(chunksize * 2.5)
 
         # dupe cols
         df = tm.makeCustomDataframe(N, 3)
         df.columns = ["a", "a", "b"]
-        _check_df(df, None)
+        with tm.ensure_clean() as path:
+            df.to_csv(path, columns=cols, chunksize=chunksize)
+            rs_c = read_csv(path, index_col=0)
 
-        # dupe cols with selection
-        cols = ["b", "a"]
-        _check_df(df, cols)
+            # we wrote them in a different order
+            # so compare them in that order
+            if cols is not None:
+
+                if df.columns.is_unique:
+                    rs_c.columns = cols
+                else:
+                    indexer, missing = df.columns.get_indexer_non_unique(cols)
+                    rs_c.columns = df.columns.take(indexer)
+
+                for c in cols:
+                    obj_df = df[c]
+                    obj_rs = rs_c[c]
+                    if isinstance(obj_df, Series):
+                        tm.assert_series_equal(obj_df, obj_rs)
+                    else:
+                        tm.assert_frame_equal(obj_df, obj_rs, check_names=False)
+
+            # wrote in the same order
+            else:
+                rs_c.columns = df.columns
+                tm.assert_frame_equal(df, rs_c, check_names=False)
 
     @pytest.mark.slow
     def test_to_csv_dtnat(self):
@@ -235,11 +217,9 @@ def make_dtnat_arr(n, nnat=None):
             return s
 
         chunksize = 1000
-        # N=35000
         s1 = make_dtnat_arr(chunksize + 5)
         s2 = make_dtnat_arr(chunksize + 5, 0)
 
-        # s3=make_dtnjat_arr(chunksize+5,0)
         with tm.ensure_clean("1.csv") as pth:
             df = DataFrame({"a": s1, "b": s2})
             df.to_csv(pth, chunksize=chunksize)
@@ -247,224 +227,195 @@ def make_dtnat_arr(n, nnat=None):
             recons = self.read_csv(pth).apply(to_datetime)
             tm.assert_frame_equal(df, recons, check_names=False)
 
-    @pytest.mark.slow
-    def test_to_csv_moar(self):
-        def _do_test(
-            df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False
-        ):
-
-            kwargs = {"parse_dates": False}
-            if cnlvl:
-                if rnlvl is not None:
-                    kwargs["index_col"] = list(range(rnlvl))
-                kwargs["header"] = list(range(cnlvl))
-
-                with tm.ensure_clean("__tmp_to_csv_moar__") as path:
-                    df.to_csv(path, encoding="utf8", chunksize=chunksize)
-                    recons = self.read_csv(path, **kwargs)
+    def _return_result_expected(
+        self,
+        df,
+        chunksize,
+        r_dtype=None,
+        c_dtype=None,
+        rnlvl=None,
+        cnlvl=None,
+        dupe_col=False,
+    ):
+        kwargs = {"parse_dates": False}
+        if cnlvl:
+            if rnlvl is not None:
+                kwargs["index_col"] = list(range(rnlvl))
+            kwargs["header"] = list(range(cnlvl))
+
+            with tm.ensure_clean("__tmp_to_csv_moar__") as path:
+                df.to_csv(path, encoding="utf8", chunksize=chunksize)
+                recons = self.read_csv(path, **kwargs)
+        else:
+            kwargs["header"] = 0
+
+            with tm.ensure_clean("__tmp_to_csv_moar__") as path:
+                df.to_csv(path, encoding="utf8", chunksize=chunksize)
+                recons = self.read_csv(path, **kwargs)
+
+        def _to_uni(x):
+            if not isinstance(x, str):
+                return x.decode("utf8")
+            return x
+
+        if dupe_col:
+            # read_csv disambiguates the columns by
+            # labeling them dupe.1, dupe.2, etc.; monkey patch the columns
+            recons.columns = df.columns
+        if rnlvl and not cnlvl:
+            delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)]
+            ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
+            recons.index = ix
+            recons = recons.iloc[:, rnlvl - 1 :]
+
+        type_map = {"i": "i", "f": "f", "s": "O", "u": "O", "dt": "O", "p": "O"}
+        if r_dtype:
+            if r_dtype == "u":  # unicode
+                r_dtype = "O"
+                recons.index = np.array(
+                    [_to_uni(label) for label in recons.index], dtype=r_dtype
+                )
+                df.index = np.array(
+                    [_to_uni(label) for label in df.index], dtype=r_dtype
+                )
+            elif r_dtype == "dt":  # datetime
+                r_dtype = "O"
+                recons.index = np.array(
+                    [Timestamp(label) for label in recons.index], dtype=r_dtype
+                )
+                df.index = np.array(
+                    [Timestamp(label) for label in df.index], dtype=r_dtype
+                )
+            elif r_dtype == "p":
+                r_dtype = "O"
+                idx_list = to_datetime(recons.index)
+                recons.index = np.array(
+                    [Timestamp(label) for label in idx_list], dtype=r_dtype
+                )
+                df.index = np.array(
+                    list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype
+                )
             else:
-                kwargs["header"] = 0
-
-                with tm.ensure_clean("__tmp_to_csv_moar__") as path:
-                    df.to_csv(path, encoding="utf8", chunksize=chunksize)
-                    recons = self.read_csv(path, **kwargs)
-
-            def _to_uni(x):
-                if not isinstance(x, str):
-                    return x.decode("utf8")
-                return x
-
-            if dupe_col:
-                # read_Csv disambiguates the columns by
-                # labeling them dupe.1,dupe.2, etc'. monkey patch columns
-                recons.columns = df.columns
-            if rnlvl and not cnlvl:
-                delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)]
-                ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
-                recons.index = ix
-                recons = recons.iloc[:, rnlvl - 1 :]
-
-            type_map = {"i": "i", "f": "f", "s": "O", "u": "O", "dt": "O", "p": "O"}
-            if r_dtype:
-                if r_dtype == "u":  # unicode
-                    r_dtype = "O"
-                    recons.index = np.array(
-                        [_to_uni(label) for label in recons.index], dtype=r_dtype
-                    )
-                    df.index = np.array(
-                        [_to_uni(label) for label in df.index], dtype=r_dtype
-                    )
-                elif r_dtype == "dt":  # unicode
-                    r_dtype = "O"
-                    recons.index = np.array(
-                        [Timestamp(label) for label in recons.index], dtype=r_dtype
-                    )
-                    df.index = np.array(
-                        [Timestamp(label) for label in df.index], dtype=r_dtype
-                    )
-                elif r_dtype == "p":
-                    r_dtype = "O"
-                    idx_list = to_datetime(recons.index)
-                    recons.index = np.array(
-                        [Timestamp(label) for label in idx_list], dtype=r_dtype
-                    )
-                    df.index = np.array(
-                        list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype
-                    )
-                else:
-                    r_dtype = type_map.get(r_dtype)
-                    recons.index = np.array(recons.index, dtype=r_dtype)
-                    df.index = np.array(df.index, dtype=r_dtype)
-            if c_dtype:
-                if c_dtype == "u":
-                    c_dtype = "O"
-                    recons.columns = np.array(
-                        [_to_uni(label) for label in recons.columns], dtype=c_dtype
-                    )
-                    df.columns = np.array(
-                        [_to_uni(label) for label in df.columns], dtype=c_dtype
-                    )
-                elif c_dtype == "dt":
-                    c_dtype = "O"
-                    recons.columns = np.array(
-                        [Timestamp(label) for label in recons.columns], dtype=c_dtype
-                    )
-                    df.columns = np.array(
-                        [Timestamp(label) for label in df.columns], dtype=c_dtype
-                    )
-                elif c_dtype == "p":
-                    c_dtype = "O"
-                    col_list = to_datetime(recons.columns)
-                    recons.columns = np.array(
-                        [Timestamp(label) for label in col_list], dtype=c_dtype
-                    )
-                    col_list = df.columns.to_timestamp()
-                    df.columns = np.array(
-                        [Timestamp(label) for label in col_list], dtype=c_dtype
-                    )
-                else:
-                    c_dtype = type_map.get(c_dtype)
-                    recons.columns = np.array(recons.columns, dtype=c_dtype)
-                    df.columns = np.array(df.columns, dtype=c_dtype)
+                r_dtype = type_map.get(r_dtype)
+                recons.index = np.array(recons.index, dtype=r_dtype)
+                df.index = np.array(df.index, dtype=r_dtype)
+        if c_dtype:
+            if c_dtype == "u":
+                c_dtype = "O"
+                recons.columns = np.array(
+                    [_to_uni(label) for label in recons.columns], dtype=c_dtype
+                )
+                df.columns = np.array(
+                    [_to_uni(label) for label in df.columns], dtype=c_dtype
+                )
+            elif c_dtype == "dt":
+                c_dtype = "O"
+                recons.columns = np.array(
+                    [Timestamp(label) for label in recons.columns], dtype=c_dtype
+                )
+                df.columns = np.array(
+                    [Timestamp(label) for label in df.columns], dtype=c_dtype
+                )
+            elif c_dtype == "p":
+                c_dtype = "O"
+                col_list = to_datetime(recons.columns)
+                recons.columns = np.array(
+                    [Timestamp(label) for label in col_list], dtype=c_dtype
+                )
+                col_list = df.columns.to_timestamp()
+                df.columns = np.array(
+                    [Timestamp(label) for label in col_list], dtype=c_dtype
+                )
+            else:
+                c_dtype = type_map.get(c_dtype)
+                recons.columns = np.array(recons.columns, dtype=c_dtype)
+                df.columns = np.array(df.columns, dtype=c_dtype)
+        return df, recons
 
-            tm.assert_frame_equal(df, recons, check_names=False)
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251]
+    )
+    def test_to_csv_nrows(self, nrows):
+        df = tm.makeCustomDataframe(nrows, 4, r_idx_type="dt", c_idx_type="s")
+        result, expected = self._return_result_expected(df, 1000, "dt", "s")
+        tm.assert_frame_equal(result, expected, check_names=False)
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251]
+    )
+    @pytest.mark.parametrize(
+        "r_idx_type, c_idx_type", [("i", "i"), ("s", "s"), ("u", "dt"), ("p", "p")]
+    )
+    @pytest.mark.parametrize("ncols", [1, 2, 3, 4])
+    def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols):
+        df = tm.makeCustomDataframe(
+            nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type
+        )
+        result, expected = self._return_result_expected(
+            df,
+            1000,
+            r_idx_type,
+            c_idx_type,
+        )
+        tm.assert_frame_equal(result, expected, check_names=False)
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "nrows", [10, 98, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251]
+    )
+    @pytest.mark.parametrize("ncols", [1, 2, 3, 4])
+    def test_to_csv_idx_ncols(self, nrows, ncols):
+        df = tm.makeCustomDataframe(nrows, ncols)
+        result, expected = self._return_result_expected(df, 1000)
+        tm.assert_frame_equal(result, expected, check_names=False)
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize("nrows", [10, 98, 99, 100, 101, 102])
+    def test_to_csv_dup_cols(self, nrows):
+        df = tm.makeCustomDataframe(nrows, 3)
+        cols = list(df.columns)
+        cols[:2] = ["dupe", "dupe"]
+        cols[-2:] = ["dupe", "dupe"]
+        ix = list(df.index)
+        ix[:2] = ["rdupe", "rdupe"]
+        ix[-2:] = ["rdupe", "rdupe"]
+        df.index = ix
+        df.columns = cols
+        result, expected = self._return_result_expected(df, 1000, dupe_col=True)
+        tm.assert_frame_equal(result, expected, check_names=False)
 
-        N = 100
+    @pytest.mark.slow
+    def test_to_csv_empty(self):
+        df = DataFrame(index=np.arange(10))
+        result, expected = self._return_result_expected(df, 1000)
+        tm.assert_frame_equal(result, expected, check_names=False)
+
+    @pytest.mark.slow
+    def test_to_csv_chunksize(self):
         chunksize = 1000
-        ncols = 4
-        base = chunksize // ncols
-        for nrows in [
-            2,
-            10,
-            N - 1,
-            N,
-            N + 1,
-            N + 2,
-            2 * N - 2,
-            2 * N - 1,
-            2 * N,
-            2 * N + 1,
-            2 * N + 2,
-            base - 1,
-            base,
-            base + 1,
-        ]:
-            _do_test(
-                tm.makeCustomDataframe(nrows, ncols, r_idx_type="dt", c_idx_type="s"),
-                "dt",
-                "s",
-            )
+        df = tm.makeCustomDataframe(chunksize // 2 + 1, 2, r_idx_nlevels=2)
+        result, expected = self._return_result_expected(df, chunksize, rnlvl=2)
+        tm.assert_frame_equal(result, expected, check_names=False)
 
-        for r_idx_type, c_idx_type in [("i", "i"), ("s", "s"), ("u", "dt"), ("p", "p")]:
-            for ncols in [1, 2, 3, 4]:
-                base = chunksize // ncols
-                for nrows in [
-                    2,
-                    10,
-                    N - 1,
-                    N,
-                    N + 1,
-                    N + 2,
-                    2 * N - 2,
-                    2 * N - 1,
-                    2 * N,
-                    2 * N + 1,
-                    2 * N + 2,
-                    base - 1,
-                    base,
-                    base + 1,
-                ]:
-                    _do_test(
-                        tm.makeCustomDataframe(
-                            nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type
-                        ),
-                        r_idx_type,
-                        c_idx_type,
-                    )
-
-        for ncols in [1, 2, 3, 4]:
-            base = chunksize // ncols
-            for nrows in [
-                10,
-                N - 2,
-                N - 1,
-                N,
-                N + 1,
-                N + 2,
-                2 * N - 2,
-                2 * N - 1,
-                2 * N,
-                2 * N + 1,
-                2 * N + 2,
-                base - 1,
-                base,
-                base + 1,
-            ]:
-                _do_test(tm.makeCustomDataframe(nrows, ncols))
-
-        for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
-            df = tm.makeCustomDataframe(nrows, 3)
-            cols = list(df.columns)
-            cols[:2] = ["dupe", "dupe"]
-            cols[-2:] = ["dupe", "dupe"]
-            ix = list(df.index)
-            ix[:2] = ["rdupe", "rdupe"]
-            ix[-2:] = ["rdupe", "rdupe"]
-            df.index = ix
-            df.columns = cols
-            _do_test(df, dupe_col=True)
-
-        _do_test(DataFrame(index=np.arange(10)))
-        _do_test(
-            tm.makeCustomDataframe(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2
-        )
-        for ncols in [2, 3, 4]:
-            base = int(chunksize // ncols)
-            for nrows in [
-                10,
-                N - 2,
-                N - 1,
-                N,
-                N + 1,
-                N + 2,
-                2 * N - 2,
-                2 * N - 1,
-                2 * N,
-                2 * N + 1,
-                2 * N + 2,
-                base - 1,
-                base,
-                base + 1,
-            ]:
-                _do_test(tm.makeCustomDataframe(nrows, ncols, r_idx_nlevels=2), rnlvl=2)
-                _do_test(tm.makeCustomDataframe(nrows, ncols, c_idx_nlevels=2), cnlvl=2)
-                _do_test(
-                    tm.makeCustomDataframe(
-                        nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2
-                    ),
-                    rnlvl=2,
-                    cnlvl=2,
-                )
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251]
+    )
+    @pytest.mark.parametrize("ncols", [2, 3, 4])
+    @pytest.mark.parametrize(
+        "df_params, func_params",
+        [
+            [{"r_idx_nlevels": 2}, {"rnlvl": 2}],
+            [{"c_idx_nlevels": 2}, {"cnlvl": 2}],
+            [{"r_idx_nlevels": 2, "c_idx_nlevels": 2}, {"rnlvl": 2, "cnlvl": 2}],
+        ],
+    )
+    def test_to_csv_params(self, nrows, df_params, func_params, ncols):
+        df = tm.makeCustomDataframe(nrows, ncols, **df_params)
+        result, expected = self._return_result_expected(df, 1000, **func_params)
+        tm.assert_frame_equal(result, expected, check_names=False)
 
     def test_to_csv_from_csv_w_some_infs(self, float_frame):
 
@@ -794,18 +745,18 @@ def test_to_csv_dups_cols(self):
 
         result = result.rename(columns={"a.1": "a"})
         tm.assert_frame_equal(result, df)
 
-    def test_to_csv_chunking(self):
+    @pytest.mark.parametrize("chunksize", [10000, 50000, 100000])
+    def test_to_csv_chunking(self, chunksize):
         aa = DataFrame({"A": range(100000)})
         aa["B"] = aa.A + 1.0
         aa["C"] = aa.A + 2.0
         aa["D"] = aa.A + 3.0
 
-        for chunksize in [10000, 50000, 100000]:
-            with tm.ensure_clean() as filename:
-                aa.to_csv(filename, chunksize=chunksize)
-                rs = read_csv(filename, index_col=0)
-                tm.assert_frame_equal(rs, aa)
+        with tm.ensure_clean() as filename:
+            aa.to_csv(filename, chunksize=chunksize)
+            rs = read_csv(filename, index_col=0)
+            tm.assert_frame_equal(rs, aa)
 
     @pytest.mark.slow
     def test_to_csv_wide_frame_formatting(self):
@@ -893,17 +844,17 @@ def test_to_csv_unicodewriter_quoting(self):
         expected = tm.convert_rows_list_to_csv_str(expected_rows)
         assert result == expected
 
-    def test_to_csv_quote_none(self):
+    @pytest.mark.parametrize("encoding", [None, "utf-8"])
+    def test_to_csv_quote_none(self, encoding):
         # GH4328
         df = DataFrame({"A": ["hello", '{"hello"}']})
-        for encoding in (None, "utf-8"):
-            buf = StringIO()
-            df.to_csv(buf, quoting=csv.QUOTE_NONE, encoding=encoding, index=False)
+        buf = StringIO()
+        df.to_csv(buf, quoting=csv.QUOTE_NONE, encoding=encoding, index=False)
 
-            result = buf.getvalue()
-            expected_rows = ["A", "hello", '{"hello"}']
-            expected = tm.convert_rows_list_to_csv_str(expected_rows)
-            assert result == expected
+        result = buf.getvalue()
+        expected_rows = ["A", "hello", '{"hello"}']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
+        assert result == expected
 
     def test_to_csv_index_no_leading_comma(self):
         df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"])
@@ -1102,7 +1053,8 @@ def test_to_csv_date_format(self, datetime_frame):
 
             tm.assert_frame_equal(test, nat_frame)
 
-    def test_to_csv_with_dst_transitions(self):
+    @pytest.mark.parametrize("td", [pd.Timedelta(0), pd.Timedelta("10s")])
+    def test_to_csv_with_dst_transitions(self, td):
 
         with tm.ensure_clean("csv_date_format_with_dst") as path:
             # make sure we are not failing on transitions
@@ -1113,20 +1065,20 @@ def test_to_csv_with_dst_transitions(self):
                 freq="H",
                 ambiguous="infer",
             )
+            i = times + td
+            i = i._with_freq(None)  # freq is not preserved by read_csv
+            time_range = np.array(range(len(i)), dtype="int64")
+            df = DataFrame({"A": time_range}, index=i)
+            df.to_csv(path, index=True)
+            # we have to reconvert the index as we
+            # don't parse the tz's
+            result = read_csv(path, index_col=0)
+            result.index = to_datetime(result.index, utc=True).tz_convert(
+                "Europe/London"
+            )
+            tm.assert_frame_equal(result, df)
 
-            for i in [times, times + pd.Timedelta("10s")]:
-                i = i._with_freq(None)  # freq is not preserved by read_csv
-                time_range = np.array(range(len(i)), dtype="int64")
-                df = DataFrame({"A": time_range}, index=i)
-                df.to_csv(path, index=True)
-                # we have to reconvert the index as we
-                # don't parse the tz's
-                result = read_csv(path, index_col=0)
-                result.index = to_datetime(result.index, utc=True).tz_convert(
-                    "Europe/London"
-                )
-                tm.assert_frame_equal(result, df)
-
+    def test_to_csv_with_dst_transitions_with_pickle(self):
         # GH11619
         idx = date_range("2015-01-01", "2015-12-31", freq="H", tz="Europe/Paris")
         idx = idx._with_freq(None)  # freq does not round-trip
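
Note on the pattern this refactor leans on: stacked @pytest.mark.parametrize decorators expand into the Cartesian product of their argument lists, so each combination the old nested for loops covered becomes an individually reported, individually rerunnable test case (test_to_csv_idx_types, for example, expands to 14 nrows x 4 index-type pairs x 4 ncols = 224 cases). Because the extracted _return_result_expected helper returns the (result, expected) pair instead of asserting internally, the assertion stays visible in each test body. Below is a minimal, self-contained sketch of the same round-trip pattern; it is illustrative only, assuming a hypothetical test_roundtrip name, a simple int64 frame, and pytest's built-in tmp_path fixture in place of tm.ensure_clean:

    import numpy as np
    import pandas as pd
    import pandas._testing as tm
    import pytest


    # Illustrative sketch, not part of the diff. Stacked parametrize
    # decorators yield the Cartesian product: 3 nrows values x 2 ncols
    # values = 6 independent test cases.
    @pytest.mark.parametrize("nrows", [2, 10, 99])
    @pytest.mark.parametrize("ncols", [1, 4])
    def test_roundtrip(nrows, ncols, tmp_path):
        # int64 data, a default integer index, and string column labels
        # all survive a CSV round-trip, so no dtype fix-ups are needed.
        df = pd.DataFrame(
            np.arange(nrows * ncols, dtype="int64").reshape(nrows, ncols)
        ).add_prefix("col")
        path = tmp_path / "roundtrip.csv"
        df.to_csv(path)
        result = pd.read_csv(path, index_col=0)
        tm.assert_frame_equal(result, df)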