From 381e3b39268fb0a7da9df43622306b998dd98045 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Sun, 17 Jul 2016 16:56:37 +0200 Subject: [PATCH] BUG: provide chunks with progressively numbered (default) indices --- doc/source/whatsnew/v0.19.0.txt | 34 ++++++++++++++++++++++++++ pandas/io/parsers.py | 15 +++++++++++- pandas/io/tests/parser/common.py | 12 +++++++++ pandas/io/tests/parser/test_network.py | 4 --- pandas/io/tests/test_common.py | 1 - 5 files changed, 60 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 317383e866464..5cc17216fd8f2 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -596,6 +596,40 @@ New Behavior: idx1.difference(idx2) idx1.symmetric_difference(idx2) +.. _whatsnew_0190.api.autogenerated_chunksize_index: + +:func:`read_csv` called with ``chunksize`` parameter generates correct index +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When :func:`read_csv` is called with ``chunksize=n`` and without specifying an index, +each chunk used to have an independently generated index from ``0`` to ``n-1``. +They are now given instead a progressive index, starting from ``0`` for the first chunk, +from ``n`` for the second, and so on, so that, when concatenated, they are identical to +the result of calling :func:`read_csv` without the ``chunksize=`` argument. +(:issue:`12185`) + +.. ipython:: python + + data = 'A,B\n0,1\n2,3\n4,5\n6,7' + +Previous behavior: + +.. code-block:: ipython + + In [2]: pd.concat(pd.read_csv(StringIO(data), chunksize=2)) + Out[2]: + A B + 0 0 1 + 1 2 3 + 0 4 5 + 1 6 7 + +New behavior: + +.. ipython:: python + + pd.concat(pd.read_csv(StringIO(data), chunksize=2)) + .. 
_whatsnew_0190.deprecations: Deprecations diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f6a84ea9debaa..34c83c01dc01b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -16,7 +16,7 @@ is_list_like, is_integer_dtype, is_float, is_scalar) -from pandas.core.index import Index, MultiIndex +from pandas.core.index import Index, MultiIndex, RangeIndex from pandas.core.frame import DataFrame from pandas.core.common import AbstractMethodError from pandas.core.config import get_option @@ -700,6 +700,7 @@ def __init__(self, f, engine=None, **kwds): # miscellanea self.engine = engine self._engine = None + self._currow = 0 options = self._get_options_with_defaults(engine) @@ -913,8 +914,20 @@ def read(self, nrows=None): # May alter columns / col_dict index, columns, col_dict = self._create_index(ret) + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(compat.next(compat.itervalues(col_dict))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 + else: + new_rows = len(index) + df = DataFrame(col_dict, columns=columns, index=index) + self._currow += new_rows + if self.squeeze and len(df.columns) == 1: return df[df.columns[0]].copy() return df diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 11eed79e03267..f3adb0e39982c 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -461,6 +461,18 @@ def test_get_chunk_passed_chunksize(self): piece = result.get_chunk() self.assertEqual(len(piece), 2) + def test_read_chunksize_generated_index(self): + # GH 12185 + reader = self.read_csv(StringIO(self.data1), chunksize=2) + df = self.read_csv(StringIO(self.data1)) + + tm.assert_frame_equal(pd.concat(reader), df) + + reader = self.read_csv(StringIO(self.data1), chunksize=2, index_col=0) + df = self.read_csv(StringIO(self.data1), index_col=0) + + tm.assert_frame_equal(pd.concat(reader), df) + def test_read_text_list(self): data = 
"""A,B,C\nfoo,1,2,3\nbar,4,5,6""" as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar', diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index d5370db4b55db..8b8a6de36fc03 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -122,8 +122,6 @@ def test_parse_public_s3_bucket_chunked(self): self.assertFalse(df.empty) true_df = local_tips.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] - # Chunking doesn't preserve row numbering - true_df = true_df.reset_index().drop('index', axis=1) tm.assert_frame_equal(true_df, df) @tm.network @@ -143,8 +141,6 @@ def test_parse_public_s3_bucket_chunked_python(self): self.assertFalse(df.empty) true_df = local_tips.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] - # Chunking doesn't preserve row numbering - true_df = true_df.reset_index().drop('index', axis=1) tm.assert_frame_equal(true_df, df) @tm.network diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index 0acf3244fe8fa..a443df5dac586 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -86,7 +86,6 @@ def test_iterator(self): it = read_csv(StringIO(self.data1), chunksize=1) first = next(it) tm.assert_frame_equal(first, expected.iloc[[0]]) - expected.index = [0 for i in range(len(expected))] tm.assert_frame_equal(concat(it), expected.iloc[1:])