diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 88f491ecc0bb0..65d079703e243 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -130,11 +130,10 @@ Other enhancements
 ^^^^^^^^^^^^^^^^^^

 - `read_sql` and `to_sql` can accept database URI as con parameter (:issue:`10214`)
-
 - Enable `read_hdf` to be used without specifying a key when the HDF file contains a single dataset (:issue:`10443`)
-
+- Enable writing Excel files in :ref:`memory <io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`)
+- Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
 - Added functionality to use the ``base`` argument when resampling a ``TimeDeltaIndex`` (:issue:`10530`)
-
 - ``DatetimeIndex`` can be instantiated using strings contains ``NaT`` (:issue:`7599`)
 - The string parsing of ``to_datetime``, ``Timestamp`` and ``DatetimeIndex`` has been made consistent. (:issue:`7599`)
@@ -235,7 +234,7 @@ Changes to sorting API

 The sorting API has had some longtime inconsistencies. (:issue:`9816`, :issue:`8239`).

-Here is a summary of the **PRIOR** to 0.17.0:
+Here is a summary of the API **PRIOR** to 0.17.0:

 - ``Series.sort`` is **INPLACE** while ``DataFrame.sort`` returns a new object.
 - ``Series.order`` returns a new object
@@ -256,19 +255,19 @@ will show a ``FutureWarning``.

 To sort by the **values**:

-================================= ====================================
+================================== ====================================
 Previous                           Replacement
-================================= ====================================
-\* ``Series.order()``             ``Series.sort_values()``
-\* ``Series.sort()``              ``Series.sort_values(inplace=True)``
-\* ``DataFrame.sort(columns=...)`` ``DataFrame.sort_values(by=...)``
-================================= ====================================
+================================== ====================================
+\* ``Series.order()``              ``Series.sort_values()``
+\* ``Series.sort()``               ``Series.sort_values(inplace=True)``
+\* ``DataFrame.sort(columns=...)`` ``DataFrame.sort_values(by=...)``
+================================== ====================================

 To sort by the **index**:

-================================= ====================================
-Previous                          Equivalent
-================================= ====================================
+================================== ====================================
+Previous                           Replacement
+================================== ====================================
 ``Series.sort_index()``            ``Series.sort_index()``
 ``Series.sortlevel(level=...)``    ``Series.sort_index(level=...)``
 ``DataFrame.sort_index()``         ``DataFrame.sort_index()``
@@ -281,8 +280,8 @@ We have also deprecated and changed similar methods in two Series-like classes,

 ================================== ====================================
 Previous                           Replacement
 ================================== ====================================
-\* ``Index.order()``              ``Index.sort_values()``
-\* ``Categorical.order()``        ``Categorical.sort_values``
+\* ``Index.order()``               ``Index.sort_values()``
+\* ``Categorical.order()``         ``Categorical.sort_values()``
 ================================== ====================================
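For illustration, a minimal usage sketch of the renamed sorting methods summarized in the tables above (method names are taken directly from the tables; the sketch is not part of the patch):

.. code-block:: python

   import pandas as pd

   s = pd.Series([2, 1, 3])
   s.sort_values()              # replaces the deprecated s.order()
   s.sort_values(inplace=True)  # replaces the deprecated s.sort()

   df = pd.DataFrame({'a': [2, 1], 'b': [3, 4]})
   df.sort_values(by='a')       # replaces df.sort(columns='a')
   df.sort_index()              # sorting by the index is unchanged

The deprecated names continue to work in 0.17.0 but show a ``FutureWarning``, as noted above.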
.. _whatsnew_0170.api_breaking.to_datetime:

@@ -351,7 +350,7 @@ keyword argument to ``'coerce'`` instead of ``True``, as in ``convert_dates='coe
                       's': ['apple','banana']})
    df

-The old usage of ``DataFrame.convert_objects`` used `'coerce'` along with the
+The old usage of ``DataFrame.convert_objects`` used ``'coerce'`` along with the
 type.

 .. code-block:: python
@@ -366,8 +365,7 @@ Now the ``coerce`` keyword must be explicitly used.

 In earlier versions of pandas, ``DataFrame.convert_objects`` would not coerce
 numeric types when there were no values convertible to a numeric type. This returns
-the original DataFrame with no conversion. This change alters
-this behavior so that converts all non-number-like strings to ``NaN``.
+the original DataFrame with no conversion.

 .. code-block:: python
@@ -378,6 +376,9 @@ this behavior so that converts all non-number-like strings to ``NaN``.
    0 a
    1 b

+The new behavior will convert all non-number-like strings to ``NaN``
+when ``coerce=True`` is passed explicitly.
+
 .. ipython:: python

    pd.DataFrame({'s': ['a','b']})
@@ -517,7 +518,10 @@ New behavior:

 .. ipython:: python

-   df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 'table', mode='w')
+   df_with_missing.to_hdf('file.h5',
+                          'df_with_missing',
+                          format='table',
+                          mode='w')

    pd.read_hdf('file.h5', 'df_with_missing')
@@ -571,10 +575,10 @@ from ``7``.

 Changes to ``Categorical.unique``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-``Categorical.unique`` now returns new ``Categorical`` which ``categories`` and ``codes`` that are unique, rather than returning ``np.array`` (:issue:`10508`)
+``Categorical.unique`` now returns a new ``Categorical`` with ``categories`` and ``codes`` that are unique, rather than returning ``np.array`` (:issue:`10508`)

 - unordered category: values and categories are sorted by appearance order.
-- ordered category: values are sorted by appearance order, categories keeps existing order.
+- ordered category: values are sorted by appearance order, categories keep existing order.

 .. ipython :: python
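A short sketch of the ``Categorical.unique`` behavior described by the two bullets above (values here are illustrative only; the sketch is not part of the patch):

.. code-block:: python

   # unordered: both the values and the categories of the result
   # follow appearance order
   pd.Categorical(['b', 'a', 'b', 'c']).unique()
   # [b, a, c], Categories (3, object): [b, a, c]

   # ordered: values follow appearance order,
   # categories keep their existing order
   pd.Categorical(['c', 'b', 'a'], categories=['a', 'b', 'c'],
                  ordered=True).unique()
   # [c, b, a], Categories (3, object): [a < b < c]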
@@ -597,25 +601,23 @@ Other API Changes
 - Line and kde plot with ``subplots=True`` now uses default colors, not all black. Specify ``color='k'`` to draw all lines in black (:issue:`9894`)
 - Calling the ``.value_counts`` method on a Series with ``categorical`` dtype now returns a Series with a ``CategoricalIndex`` (:issue:`10704`)
-- Enable writing Excel files in :ref:`memory <_io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`)
-- Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
 - Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
-- Serialize metadata properties of subclasses of pandas objects (:issue:`10553`).
+- The metadata properties of subclasses of pandas objects will now be serialized (:issue:`10553`).
 - Allow ``DataFrame`` with ``MultiIndex`` columns to be written to Excel (:issue:`10564`). This was changed in 0.16.2 as the read-back method could not always guarantee perfect fidelity (:issue:`9794`).
 - ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above (:issue:`10508`)
+- Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`)
+
 - ``NaT``'s methods now either raise ``ValueError``, or return ``np.nan`` or ``NaT`` (:issue:`9513`)

 =============================== ===============================================================
 Behavior                        Methods
 =============================== ===============================================================
-``return np.nan``               ``weekday``, ``isoweekday``
-``return NaT``                  ``date``, ``now``, ``replace``, ``to_datetime``, ``today``
-``return np.datetime64('NaT')`` ``to_datetime64`` (unchanged)
-``raise ValueError``            All other public methods (names not beginning with underscores)
+return ``np.nan``               ``weekday``, ``isoweekday``
+return ``NaT``                  ``date``, ``now``, ``replace``, ``to_datetime``, ``today``
+return ``np.datetime64('NaT')`` ``to_datetime64`` (unchanged)
+raise ``ValueError``            All other public methods (names not beginning with underscores)
 =============================== ===============================================================

-- Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`)
-
 .. _whatsnew_0170.deprecations:

 Deprecations
@@ -703,6 +705,8 @@ Removal of prior version deprecations/changes

 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Development support for benchmarking with the `Air Speed Velocity library `_ (:issue:`8316`)
 - Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
 - Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
 - Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` (:issue:`10820`)
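A sketch of what the ``NaT`` table above implies in practice (not part of the patch; ``strftime`` is used here only as an assumed example of the "all other public methods" row):

.. code-block:: python

   pd.NaT.weekday()        # returns np.nan
   pd.NaT.date()           # returns NaT
   pd.NaT.to_datetime64()  # returns np.datetime64('NaT'), as before

   # any other public method (name not beginning with an underscore)
   # now raises, e.g.:
   # pd.NaT.strftime('%Y')  ->  ValueError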
@@ -720,6 +724,8 @@ Performance Improvements

 Bug Fixes
 ~~~~~~~~~
+
+- Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`)
 - Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`)
 - Bug in ``DataFrame.apply`` when function returns categorical series.
   (:issue:`9573`)
 - Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`)
@@ -814,4 +820,6 @@ Bug Fixes
 - Bug in ``read_msgpack`` where encoding is not respected (:issue:`10580`)
 - Bug preventing access to the first index when using ``iloc`` with a list containing the appropriate negative integer (:issue:`10547`, :issue:`10779`)
 - Bug in ``TimedeltaIndex`` formatter causing error while trying to save ``DataFrame`` with ``TimedeltaIndex`` using ``to_csv`` (:issue:`10833`)
+- Bug in ``Timedelta`` raising error when slicing from 0s (:issue:`10583`)
 - Bug in ``DataFrame.where`` when handling Series slicing (:issue:`10218`, :issue:`9558`)
+- Bug where ``pd.read_gbq`` throws ``ValueError`` when BigQuery returns zero rows (:issue:`10273`)
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 245535e47abd8..72ea6d14456b0 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -63,6 +63,7 @@ def __str__(self):
 _int8_max = np.iinfo(np.int8).max
 _int16_max = np.iinfo(np.int16).max
 _int32_max = np.iinfo(np.int32).max
+_int64_max = np.iinfo(np.int64).max

 # define abstract base classes to enable isinstance type checking on our
 # objects
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index c70fb6339517d..447a273a1e171 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -21,7 +21,8 @@
     is_bool_dtype, is_object_dtype,
     is_datetime64_dtype, is_timedelta64_dtype,
     is_datetime_or_timedelta_dtype, _get_dtype,
-    is_int_or_datetime_dtype, is_any_int_dtype)
+    is_int_or_datetime_dtype, is_any_int_dtype,
+    _int64_max)


 class disallow(object):
@@ -145,7 +146,7 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
         else:
             if fill_value_typ == '+inf':
                 # need the max int here
-                return np.iinfo(np.int64).max
+                return _int64_max
             else:
                 return tslib.iNaT
@@ -223,7 +224,12 @@ def _wrap_results(result, dtype):
             result = result.view(dtype)
     elif is_timedelta64_dtype(dtype):
         if not isinstance(result, np.ndarray):
-            result = lib.Timedelta(result)
+
+            # raise if we have a timedelta64[ns] which is too large
+            if np.fabs(result) > _int64_max:
+                raise ValueError("overflow in timedelta operation")
+
+            result = lib.Timedelta(result, unit='ns')
         else:
             result = result.astype('i8').view(dtype)
@@ -247,6 +253,8 @@ def nansum(values, axis=None, skipna=True):
     dtype_sum = dtype_max
     if is_float_dtype(dtype):
         dtype_sum = dtype
+    elif is_timedelta64_dtype(dtype):
+        dtype_sum = np.float64
     the_sum = values.sum(axis, dtype=dtype_sum)
     the_sum = _maybe_null_out(the_sum, axis, mask)
@@ -260,7 +268,7 @@ def nanmean(values, axis=None, skipna=True):
     dtype_sum = dtype_max
     dtype_count = np.float64

-    if is_integer_dtype(dtype):
+    if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype):
         dtype_sum = np.float64
     elif is_float_dtype(dtype):
         dtype_sum = dtype
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
index 06ad8827a5642..1dff195e4b54f 100644
--- a/pandas/io/gbq.py
+++ b/pandas/io/gbq.py
@@ -121,7 +121,7 @@ def get_service(self, credentials):

         try:
             from apiclient.discovery import build
-
+
         except ImportError:
             raise ImportError('Could not import Google API Client.')
@@ -279,7 +279,7 @@ def _parse_data(schema, rows):
                                                          field_type)
             page_array[row_num][col_num] = field_value

-    return DataFrame(page_array)
+    return DataFrame(page_array, columns=col_names)

 def _parse_entry(field_value, field_type):
     if field_value is None or field_value == 'null':
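The ``nanops`` changes above make timedelta ``sum`` and ``mean`` accumulate in ``float64`` and raise once a result no longer fits into ``timedelta64[ns]``. A sketch of the user-visible effect (not part of the patch), mirroring the ``test_overflow`` test added below:

.. code-block:: python

   s = pd.Series(pd.date_range('20130101', periods=100000, freq='H'))
   deltas = s - s.min()

   deltas.mean()  # accumulated in float64, so no silent int64 overflow
   deltas.sum()   # the total no longer fits in timedelta64[ns]:
                  # raises ValueError("overflow in timedelta operation")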
@@ -338,7 +338,10 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=Fals
         page = pages.pop()
         dataframe_list.append(_parse_data(schema, page))

-    final_df = concat(dataframe_list, ignore_index = True)
+    if len(dataframe_list) > 0:
+        final_df = concat(dataframe_list, ignore_index=True)
+    else:
+        final_df = _parse_data(schema, [])

     # Reindex the DataFrame on the provided column
     if index_col is not None:
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
index 5417842d3f863..f04eeb03f790e 100644
--- a/pandas/io/tests/test_gbq.py
+++ b/pandas/io/tests/test_gbq.py
@@ -296,6 +296,13 @@ def test_download_dataset_larger_than_200k_rows(self):
         df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] GROUP EACH BY id ORDER BY id ASC LIMIT 200005", project_id=PROJECT_ID)
         self.assertEqual(len(df.drop_duplicates()), 200005)

+    def test_zero_rows(self):
+        # Bug fix for https://github.com/pydata/pandas/issues/10273
+        df = gbq.read_gbq("SELECT title, language FROM [publicdata:samples.wikipedia] where timestamp=-9999999", project_id=PROJECT_ID)
+        expected_result = DataFrame(columns=['title', 'language'])
+        self.assert_frame_equal(df, expected_result)
+
+
 class TestToGBQIntegration(tm.TestCase):
     # This class requires bq.py to be installed for setup/teardown.
     # It will also need to be preconfigured with a default dataset,
diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py
index 753e76fd1faea..e84dc6c692737 100644
--- a/pandas/tseries/tests/test_timedeltas.py
+++ b/pandas/tseries/tests/test_timedeltas.py
@@ -404,6 +404,13 @@ def test_timedelta_range(self):
         result = timedelta_range('0 days',freq='30T',periods=50)
         tm.assert_index_equal(result, expected)

+        # issue10583
+        df = pd.DataFrame(np.random.normal(size=(10,4)))
+        df.index = pd.timedelta_range(start='0s', periods=10, freq='s')
+        expected = df.loc[pd.Timedelta('0s'):,:]
+        result = df.loc['0s':,:]
+        assert_frame_equal(expected, result)
+
     def test_numeric_conversions(self):
         self.assertEqual(ct(0), np.timedelta64(0,'ns'))
         self.assertEqual(ct(10), np.timedelta64(10,'ns'))
@@ -686,6 +693,25 @@ def test_timedelta_ops(self):
         s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'), Timestamp('2015-02-15')])
         self.assertEqual(s.diff().median(), timedelta(days=6))

+    def test_overflow(self):
+        # GH 9442
+        s = Series(pd.date_range('20130101',periods=100000,freq='H'))
+        s[0] += pd.Timedelta('1s 1ms')
+
+        # mean
+        result = (s-s.min()).mean()
+        expected = pd.Timedelta((pd.DatetimeIndex((s-s.min())).asi8/len(s)).sum())
+
+        # the computation is done in float, so there may be some loss of precision
+        self.assertTrue(np.allclose(result.value/1000, expected.value/1000))
+
+        # sum
+        self.assertRaises(ValueError, lambda : (s-s.min()).sum())
+        s1 = s[0:10000]
+        self.assertRaises(ValueError, lambda : (s1-s1.min()).sum())
+        s2 = s[0:1000]
+        result = (s2-s2.min()).sum()
+
     def test_timedelta_ops_scalar(self):
         # GH 6808
         base = pd.to_datetime('20130101 09:01:12.123456')
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index 369993b4c54d1..77ac362181a2b 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -2265,9 +2265,8 @@ class Timedelta(_Timedelta):
             return "m"
         elif self._h:
             return "h"
-        elif self._d:
+        else:
             return "D"
-        raise ValueError("invalid resolution")

     def round(self, reso):
         """
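On the ``tslib.pyx`` change above: a zero ``Timedelta`` previously fell through every resolution branch and hit ``raise ValueError("invalid resolution")``, which broke label slicing from ``'0s'`` (:issue:`10583`); the fallback now returns ``"D"``. A minimal sketch of the user-visible effect (not part of the patch), mirroring the test added above:

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame(np.random.normal(size=(10, 4)),
                     index=pd.timedelta_range(start='0s', periods=10, freq='s'))

   df.loc['0s':]                # previously raised ValueError("invalid resolution")
   df.loc[pd.Timedelta('0s'):]  # equivalent explicit form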