From 7e43c78a4e310955f3f214f58d1b77dc03c2ec0d Mon Sep 17 00:00:00 2001 From: "Graham R. Jeffries" Date: Mon, 27 Mar 2017 13:36:19 -0400 Subject: [PATCH 1/3] Remove NotImplementedError for parse_dates keyword in read_excel Rebase and update of PR https://github.com/pydata/pandas/pull/12051 Author: Joris Van den Bossche Author: Graham R. Jeffries This patch had conflicts when merged, resolved by Committer: Jeff Reback Closes #14326 from jorisvandenbossche/pr/12051 and squashes the following commits: 0b65a7a [Joris Van den Bossche] update wording 656ec44 [Joris Van den Bossche] Fix detection to raise warning b1c7f87 [Joris Van den Bossche] add whatsnew 925ce1b [Joris Van den Bossche] Update tests 0e10a9d [Graham R. Jeffries] remove read_excel kwd NotImplemented error, update documentation #11544 --- doc/source/io.rst | 14 +++++++++++++ doc/source/whatsnew/v0.19.0.txt | 4 ++++ pandas/io/excel.py | 9 +++------ pandas/tests/io/test_excel.py | 36 ++++++++++++++++++++------------- 4 files changed, 43 insertions(+), 20 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index faeea9d448cf2..e72224c6fa1fe 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2767,6 +2767,20 @@ indices to be parsed. read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3]) + +Parsing Dates ++++++++++++++ + +Datetime-like values are normally automatically converted to the appropriate +dtype when reading the excel file. But if you have a column of strings that +*look* like dates (but are not actually formatted as dates in excel), you can +use the ``parse_dates`` keyword to parse those strings to datetimes: + +.. 
code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings']) + + Cell Converters +++++++++++++++ diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 9b003034aa94a..11df0afb144ea 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -517,6 +517,7 @@ Other enhancements - The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json ` (:issue:`9180`) - :func:`read_excel` now supports the true_values and false_values keyword arguments (:issue:`13347`) - ``groupby()`` will now accept a scalar and a single-element list for specifying ``level`` on a non-``MultiIndex`` grouper. (:issue:`13907`) +<<<<<<< HEAD - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`). - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) - ``pd.read_stata()`` can now handle some format 111 files, which are produced by SAS when generating Stata dta files (:issue:`11526`) @@ -524,6 +525,9 @@ Other enhancements series or indices. This behaves like a standard binary operator with regards to broadcasting rules (:issue:`14208`). +======= +- Re-enable the ``parse_dates`` keyword of ``read_excel`` to parse string columns as dates (:issue:`14326`) +>>>>>>> PR_TOOL_MERGE_PR_14326 .. 
_whatsnew_0190.api: diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 82ea2e8a46592..e7a8b71a5f6c9 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -343,13 +343,10 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None, if 'chunksize' in kwds: raise NotImplementedError("chunksize keyword of read_excel " "is not implemented") - if parse_dates: - raise NotImplementedError("parse_dates keyword of read_excel " - "is not implemented") - if date_parser is not None: - raise NotImplementedError("date_parser keyword of read_excel " - "is not implemented") + if parse_dates is True and not index_col: + warn("The 'parse_dates=True' keyword of read_excel was provided" + " without an 'index_col' keyword value.") import xlrd from xlrd import (xldate, XL_CELL_DATE, diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index b66cb24bf44d8..df77708232dd2 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -924,17 +924,27 @@ def test_read_excel_chunksize(self): chunksize=100) def test_read_excel_parse_dates(self): - # GH 11544 - with tm.assertRaises(NotImplementedError): - pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), - parse_dates=True) + # GH 11544, 12051 - def test_read_excel_date_parser(self): - # GH 11544 - with tm.assertRaises(NotImplementedError): - dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), - date_parser=dateparse) + df = DataFrame( + {'col': [1, 2, 3], + 'date_strings': pd.date_range('2012-01-01', periods=3)}) + df2 = df.copy() + df2['date_strings'] = df2['date_strings'].dt.strftime('%m/%d/%Y') + + with ensure_clean(self.ext) as pth: + df2.to_excel(pth) + + res = read_excel(pth) + tm.assert_frame_equal(df2, res) + + res = read_excel(pth, parse_dates=['date_strings']) + tm.assert_frame_equal(df, res) + + dateparser = lambda x: pd.datetime.strptime(x, '%m/%d/%Y') + res = 
read_excel(pth, parse_dates=['date_strings'], + date_parser=dateparser) + tm.assert_frame_equal(df, res) def test_read_excel_skiprows_list(self): # GH 4903 @@ -1382,8 +1392,7 @@ def test_to_excel_multiindex(self): # round trip frame.to_excel(path, 'test1', merge_cells=self.merge_cells) reader = ExcelFile(path) - df = read_excel(reader, 'test1', index_col=[0, 1], - parse_dates=False) + df = read_excel(reader, 'test1', index_col=[0, 1]) tm.assert_frame_equal(frame, df) # GH13511 @@ -1424,8 +1433,7 @@ def test_to_excel_multiindex_cols(self): frame.to_excel(path, 'test1', merge_cells=self.merge_cells) reader = ExcelFile(path) df = read_excel(reader, 'test1', header=header, - index_col=[0, 1], - parse_dates=False) + index_col=[0, 1]) if not self.merge_cells: fm = frame.columns.format(sparsify=False, adjoin=False, names=False) From 5d5989876f5fc5841f2f7e37cd16e7630623fa80 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 27 Mar 2017 13:37:32 -0400 Subject: [PATCH 2/3] BUG: index_names can be None when processing date conversions --- doc/source/whatsnew/v0.19.0.txt | 4 ---- doc/source/whatsnew/v0.20.0.txt | 3 ++- pandas/io/excel.py | 2 +- pandas/io/parsers.py | 13 ++++++++++--- pandas/tests/io/test_excel.py | 5 +++-- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 11df0afb144ea..9b003034aa94a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -517,7 +517,6 @@ Other enhancements - The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json ` (:issue:`9180`) - :func:`read_excel` now supports the true_values and false_values keyword arguments (:issue:`13347`) - ``groupby()`` will now accept a scalar and a single-element list for specifying ``level`` on a non-``MultiIndex`` grouper. 
(:issue:`13907`) -<<<<<<< HEAD - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`). - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) - ``pd.read_stata()`` can now handle some format 111 files, which are produced by SAS when generating Stata dta files (:issue:`11526`) @@ -525,9 +524,6 @@ Other enhancements series or indices. This behaves like a standard binary operator with regards to broadcasting rules (:issue:`14208`). -======= -- Re-enable the ``parse_dates`` keyword of ``read_excel`` to parse string columns as dates (:issue:`14326`) ->>>>>>> PR_TOOL_MERGE_PR_14326 .. _whatsnew_0190.api: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 3ab69e1ff409b..fdf34e0d11572 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -270,7 +270,7 @@ To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you .. _whatsnew_0200.enhancements.other: -Other enhancements +Other Enhancements ^^^^^^^^^^^^^^^^^^ - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. @@ -314,6 +314,7 @@ Other enhancements - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs ` for more information. - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) - ``pd.DataFrame.to_latex`` and ``pd.DataFrame.to_string`` now allow optional header aliases. (:issue:`15536`) +- Re-enable the ``parse_dates`` keyword of ``read_excel`` to parse string columns as dates (:issue:`14326`) .. 
_ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/io/excel.py b/pandas/io/excel.py index e7a8b71a5f6c9..d324855bc2f4d 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -344,7 +344,7 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None, raise NotImplementedError("chunksize keyword of read_excel " "is not implemented") - if parse_dates is True and not index_col: + if parse_dates is True and index_col is None: warn("The 'parse_dates=True' keyword of read_excel was provided" " without an 'index_col' keyword value.") diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 45c62b224ef4e..30b88de91ef76 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1176,13 +1176,18 @@ def _should_parse_dates(self, i): if isinstance(self.parse_dates, bool): return self.parse_dates else: - name = self.index_names[i] + if self.index_names is not None: + name = self.index_names[i] + else: + name = None j = self.index_col[i] if is_scalar(self.parse_dates): - return (j == self.parse_dates) or (name == self.parse_dates) + return ((j == self.parse_dates) or + (name is not None and name == self.parse_dates)) else: - return (j in self.parse_dates) or (name in self.parse_dates) + return ((j in self.parse_dates) or + (name is not None and name in self.parse_dates)) def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_names=False): @@ -1352,6 +1357,7 @@ def _get_name(icol): def _agg_index(self, index, try_parse_dates=True): arrays = [] + for i, arr in enumerate(index): if (try_parse_dates and self._should_parse_dates(i)): @@ -1512,6 +1518,7 @@ def _cast_types(self, values, cast_type, column): def _do_date_conversions(self, names, data): # returns data, columns + if self.parse_dates is not None: data, names = _process_date_conversion( data, self._date_conv, self.parse_dates, self.index_col, diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 
df77708232dd2..87e87bc4aba65 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -935,15 +935,16 @@ def test_read_excel_parse_dates(self): with ensure_clean(self.ext) as pth: df2.to_excel(pth) + # no index_col specified res = read_excel(pth) tm.assert_frame_equal(df2, res) - res = read_excel(pth, parse_dates=['date_strings']) + res = read_excel(pth, parse_dates=['date_strings'], index_col=0) tm.assert_frame_equal(df, res) dateparser = lambda x: pd.datetime.strptime(x, '%m/%d/%Y') res = read_excel(pth, parse_dates=['date_strings'], - date_parser=dateparser) + date_parser=dateparser, index_col=0) tm.assert_frame_equal(df, res) def test_read_excel_skiprows_list(self): From a1eee6789df1ffbc171704f79bb7438555f398c9 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 27 Mar 2017 15:46:09 -0400 Subject: [PATCH 3/3] test for warning --- pandas/tests/io/test_excel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 87e87bc4aba65..256a37e922177 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -935,10 +935,14 @@ def test_read_excel_parse_dates(self): with ensure_clean(self.ext) as pth: df2.to_excel(pth) - # no index_col specified res = read_excel(pth) tm.assert_frame_equal(df2, res) + # no index_col specified when parse_dates is True + with tm.assert_produces_warning(): + res = read_excel(pth, parse_dates=True) + tm.assert_frame_equal(df2, res) + res = read_excel(pth, parse_dates=['date_strings'], index_col=0) tm.assert_frame_equal(df, res)