BUG: Fixed bug where Timedelta raises an error when slicing from 0s (issue #10583) #10957

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 38 additions & 30 deletions doc/source/whatsnew/v0.17.0.txt
@@ -130,11 +130,10 @@ Other enhancements
^^^^^^^^^^^^^^^^^^

- ``read_sql`` and ``to_sql`` can accept database URI as con parameter (:issue:`10214`)

- Enable ``read_hdf`` to be used without specifying a key when the HDF file contains a single dataset (:issue:`10443`) (see the sketch below this list)

- Enable writing Excel files in :ref:`memory <_io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`)
- Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
- Added functionality to use the ``base`` argument when resampling a ``TimedeltaIndex`` (:issue:`10530`)

- ``DatetimeIndex`` can be instantiated using strings containing ``NaT`` (:issue:`7599`)
- The string parsing of ``to_datetime``, ``Timestamp`` and ``DatetimeIndex`` has been made consistent. (:issue:`7599`)
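
A minimal sketch of the single-dataset ``read_hdf`` enhancement above (the file name and contents here are hypothetical):

.. code-block:: python

    import pandas as pd

    # write a file holding exactly one dataset
    pd.DataFrame({'a': [1, 2, 3]}).to_hdf('single.h5', 'only_key')

    # the key may now be omitted, since the file is unambiguous
    df = pd.read_hdf('single.h5')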

@@ -235,7 +234,7 @@ Changes to sorting API

The sorting API has had some longtime inconsistencies. (:issue:`9816`, :issue:`8239`).

Here is a summary of the **PRIOR** to 0.17.0:
Here is a summary of the API **PRIOR** to 0.17.0:

- ``Series.sort`` is **INPLACE** while ``DataFrame.sort`` returns a new object.
- ``Series.order`` returns a new object
@@ -256,19 +255,19 @@ will show a ``FutureWarning``.

To sort by the **values**:

================================= ====================================
================================== ====================================
Previous Replacement
================================= ====================================
\* ``Series.order()`` ``Series.sort_values()``
\* ``Series.sort()`` ``Series.sort_values(inplace=True)``
\* ``DataFrame.sort(columns=...)`` ``DataFrame.sort_values(by=...)``
================================= ====================================
================================== ====================================
\* ``Series.order()`` ``Series.sort_values()``
\* ``Series.sort()`` ``Series.sort_values(inplace=True)``
\* ``DataFrame.sort(columns=...)`` ``DataFrame.sort_values(by=...)``
================================== ====================================

To sort by the **index**:

================================= ====================================
Previous Equivalent
================================= ====================================
================================== ====================================
Previous Replacement
================================== ====================================
``Series.sort_index()`` ``Series.sort_index()``
``Series.sortlevel(level=...)``    ``Series.sort_index(level=...)``
``DataFrame.sort_index()`` ``DataFrame.sort_index()``
@@ -281,8 +280,8 @@ We have also deprecated and changed similar methods in two Series-like classes,
================================== ====================================
Previous Replacement
================================== ====================================
\* ``Index.order()`` ``Index.sort_values()``
\* ``Categorical.order()`` ``Categorical.sort_values``
\* ``Index.order()`` ``Index.sort_values()``
\* ``Categorical.order()`` ``Categorical.sort_values``
================================== ====================================
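
As a quick illustration of the replacements above (a sketch; outputs omitted):

.. code-block:: python

    import pandas as pd

    s = pd.Series([3, 1, 2])
    s.sort_values()               # previously s.order()
    s.sort_values(inplace=True)   # previously s.sort()

    df = pd.DataFrame({'a': [2, 1]})
    df.sort_values(by='a')        # previously df.sort(columns='a')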

.. _whatsnew_0170.api_breaking.to_datetime:
@@ -351,7 +350,7 @@ keyword argument to ``'coerce'`` instead of ``True``, as in ``convert_dates='coe
's': ['apple','banana']})
df

The old usage of ``DataFrame.convert_objects`` used `'coerce'` along with the
The old usage of ``DataFrame.convert_objects`` used ``'coerce'`` along with the
type.

.. code-block:: python
@@ -366,8 +365,7 @@ Now the ``coerce`` keyword must be explicitly used.

In earlier versions of pandas, ``DataFrame.convert_objects`` would not coerce
numeric types when there were no values convertible to a numeric type. This returns
the original DataFrame with no conversion. This change alters
this behavior so that converts all non-number-like strings to ``NaN``.
the original DataFrame with no conversion.

.. code-block:: python

@@ -378,6 +376,9 @@ this behavior so that converts all non-number-like strings to ``NaN``.
0 a
1 b

The new behavior will convert all non-number-like strings to ``NaN``
when ``coerce=True`` is passed explicitly.

.. ipython:: python

pd.DataFrame({'s': ['a','b']})
@@ -517,7 +518,10 @@ New behavior:

.. ipython:: python

df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 'table', mode='w')
df_with_missing.to_hdf('file.h5',
'df_with_missing',
format='table',
mode='w')

pd.read_hdf('file.h5', 'df_with_missing')

@@ -571,10 +575,10 @@ from ``7``.
Changes to ``Categorical.unique``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

``Categorical.unique`` now returns new ``Categorical`` which ``categories`` and ``codes`` that are unique, rather than returning ``np.array`` (:issue:`10508`)
``Categorical.unique`` now returns new ``Categoricals`` with ``categories`` and ``codes`` that are unique, rather than returning ``np.array`` (:issue:`10508`)

- unordered category: values and categories are sorted by appearance order.
- ordered category: values are sorted by appearance order, categories keeps existing order.
- ordered category: values are sorted by appearance order, categories keep existing order.

.. ipython:: python

@@ -597,25 +601,23 @@ Other API Changes

- Line and kde plots with ``subplots=True`` now use default colors, not all black. Specify ``color='k'`` to draw all lines in black (:issue:`9894`)
- Calling the ``.value_counts`` method on a Series with ``categorical`` dtype now returns a Series with a ``CategoricalIndex`` (:issue:`10704`)
- Enable writing Excel files in :ref:`memory <_io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`)
- Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
- Allow passing ``kwargs`` to the interpolation methods (:issue:`10378`).
- Serialize metadata properties of subclasses of pandas objects (:issue:`10553`).
- The metadata properties of subclasses of pandas objects will now be serialized (:issue:`10553`).
- Allow ``DataFrame`` with ``MultiIndex`` columns to be written to Excel (:issue:`10564`). This was changed in 0.16.2 as the read-back method could not always guarantee perfect fidelity (:issue:`9794`).
- ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above (:issue:`10508`)
- Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`)

- ``NaT``'s methods now either raise ``ValueError``, or return ``np.nan`` or ``NaT`` (:issue:`9513`)

=============================== ===============================================================
Behavior Methods
=============================== ===============================================================
``return np.nan`` ``weekday``, ``isoweekday``
``return NaT`` ``date``, ``now``, ``replace``, ``to_datetime``, ``today``
``return np.datetime64('NaT')`` ``to_datetime64`` (unchanged)
``raise ValueError`` All other public methods (names not beginning with underscores)
return ``np.nan`` ``weekday``, ``isoweekday``
return ``NaT`` ``date``, ``now``, ``replace``, ``to_datetime``, ``today``
return ``np.datetime64('NaT')`` ``to_datetime64`` (unchanged)
raise ``ValueError`` All other public methods (names not beginning with underscores)
=============================== ===============================================================
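
A minimal sketch of the new ``NaT`` behavior (illustrative only; ``strftime`` stands in for "all other public methods" per the table):

.. code-block:: python

    import pandas as pd

    pd.NaT.weekday()        # np.nan
    pd.NaT.date()           # NaT
    pd.NaT.to_datetime64()  # np.datetime64('NaT'), unchanged

    try:
        pd.NaT.strftime('%Y')  # any other public method
    except ValueError:
        pass                   # raises ValueError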

- Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`)

.. _whatsnew_0170.deprecations:

Deprecations
@@ -703,6 +705,8 @@ Removal of prior version deprecations/changes

Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Development support for benchmarking with the `Air Speed Velocity library <https://github.com/spacetelescope/asv/>`_ (:issue:`8316`)
- Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
- Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
- Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` (:issue:`10820`)
@@ -720,6 +724,8 @@ Performance Improvements

Bug Fixes
~~~~~~~~~

- Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`)
- Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`)
- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
- Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`)
@@ -814,4 +820,6 @@ Bug Fixes
- Bug in ``read_msgpack`` where encoding is not respected (:issue:`10580`)
- Bug preventing access to the first index when using ``iloc`` with a list containing the appropriate negative integer (:issue:`10547`, :issue:`10779`)
- Bug in ``TimedeltaIndex`` formatter causing error while trying to save ``DataFrame`` with ``TimedeltaIndex`` using ``to_csv`` (:issue:`10833`)
- Bug in ``Timedelta`` raising an error when slicing from 0s (:issue:`10583`)
- Bug in ``DataFrame.where`` when handling Series slicing (:issue:`10218`, :issue:`9558`)
- Bug where ``pd.read_gbq`` throws ``ValueError`` when Bigquery returns zero rows (:issue:`10273`)
1 change: 1 addition & 0 deletions pandas/core/common.py
@@ -63,6 +63,7 @@ def __str__(self):
_int8_max = np.iinfo(np.int8).max
_int16_max = np.iinfo(np.int16).max
_int32_max = np.iinfo(np.int32).max
_int64_max = np.iinfo(np.int64).max

# define abstract base classes to enable isinstance type checking on our
# objects
Expand Down
16 changes: 12 additions & 4 deletions pandas/core/nanops.py
@@ -21,7 +21,8 @@
is_bool_dtype, is_object_dtype,
is_datetime64_dtype, is_timedelta64_dtype,
is_datetime_or_timedelta_dtype, _get_dtype,
is_int_or_datetime_dtype, is_any_int_dtype)
is_int_or_datetime_dtype, is_any_int_dtype,
_int64_max)


class disallow(object):
@@ -145,7 +146,7 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
else:
if fill_value_typ == '+inf':
# need the max int here
return np.iinfo(np.int64).max
return _int64_max
else:
return tslib.iNaT

@@ -223,7 +224,12 @@ def _wrap_results(result, dtype):
result = result.view(dtype)
elif is_timedelta64_dtype(dtype):
if not isinstance(result, np.ndarray):
result = lib.Timedelta(result)

# raise if we have a timedelta64[ns] which is too large
if np.fabs(result) > _int64_max:
raise ValueError("overflow in timedelta operation")

result = lib.Timedelta(result, unit='ns')
else:
result = result.astype('i8').view(dtype)

@@ -247,6 +253,8 @@ def nansum(values, axis=None, skipna=True):
dtype_sum = dtype_max
if is_float_dtype(dtype):
dtype_sum = dtype
elif is_timedelta64_dtype(dtype):
dtype_sum = np.float64
the_sum = values.sum(axis, dtype=dtype_sum)
the_sum = _maybe_null_out(the_sum, axis, mask)

@@ -260,7 +268,7 @@ def nanmean(values, axis=None, skipna=True):

dtype_sum = dtype_max
dtype_count = np.float64
if is_integer_dtype(dtype):
if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype):
dtype_sum = np.float64
elif is_float_dtype(dtype):
dtype_sum = dtype
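
Taken together, these nanops changes accumulate timedelta reductions in float64 and guard the result against int64 overflow. A sketch of the intended effect through the public API (illustrative only):

    import pandas as pd

    s = pd.Series(pd.timedelta_range('1 day', periods=3))
    s.mean()   # now computed via float64, avoiding silent int64 overflow
    s.sum()    # raises ValueError if the result would overflow int64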
9 changes: 6 additions & 3 deletions pandas/io/gbq.py
@@ -121,7 +121,7 @@ def get_service(self, credentials):

try:
from apiclient.discovery import build

except ImportError:
raise ImportError('Could not import Google API Client.')

@@ -279,7 +279,7 @@ def _parse_data(schema, rows):
field_type)
page_array[row_num][col_num] = field_value

return DataFrame(page_array)
return DataFrame(page_array, columns=col_names)

def _parse_entry(field_value, field_type):
if field_value is None or field_value == 'null':
@@ -338,7 +338,10 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=Fals
page = pages.pop()
dataframe_list.append(_parse_data(schema, page))

final_df = concat(dataframe_list, ignore_index = True)
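# concat raises on an empty sequence, so when BigQuery returned no rows
# build an empty frame (with the right columns) from the schema instead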
if len(dataframe_list) > 0:
final_df = concat(dataframe_list, ignore_index=True)
else:
final_df = _parse_data(schema, [])

# Reindex the DataFrame on the provided column
if index_col is not None:
7 changes: 7 additions & 0 deletions pandas/io/tests/test_gbq.py
@@ -296,6 +296,13 @@ def test_download_dataset_larger_than_200k_rows(self):
df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] GROUP EACH BY id ORDER BY id ASC LIMIT 200005", project_id=PROJECT_ID)
self.assertEqual(len(df.drop_duplicates()), 200005)

def test_zero_rows(self):
# Bug fix for https://github.com/pydata/pandas/issues/10273
df = gbq.read_gbq("SELECT title, language FROM [publicdata:samples.wikipedia] where timestamp=-9999999", project_id=PROJECT_ID)
expected_result = DataFrame(columns=['title', 'language'])
tm.assert_frame_equal(df, expected_result)


class TestToGBQIntegration(tm.TestCase):
# This class requires bq.py to be installed for setup/teardown.
# It will also need to be preconfigured with a default dataset,
26 changes: 26 additions & 0 deletions pandas/tseries/tests/test_timedeltas.py
@@ -404,6 +404,13 @@ def test_timedelta_range(self):
result = timedelta_range('0 days',freq='30T',periods=50)
tm.assert_index_equal(result, expected)

# GH 10583
df = pd.DataFrame(np.random.normal(size=(10,4)))
df.index = pd.timedelta_range(start='0s', periods=10, freq='s')
expected = df.loc[pd.Timedelta('0s'):,:]
result = df.loc['0s':,:]
assert_frame_equal(expected, result)

def test_numeric_conversions(self):
self.assertEqual(ct(0), np.timedelta64(0,'ns'))
self.assertEqual(ct(10), np.timedelta64(10,'ns'))
@@ -686,6 +693,25 @@ def test_timedelta_ops(self):
s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'), Timestamp('2015-02-15')])
self.assertEqual(s.diff().median(), timedelta(days=6))

def test_overflow(self):
# GH 9442
s = Series(pd.date_range('20130101',periods=100000,freq='H'))
s[0] += pd.Timedelta('1s 1ms')

# mean
result = (s-s.min()).mean()
expected = pd.Timedelta((pd.DatetimeIndex((s-s.min())).asi8/len(s)).sum())

# the computation is converted to float so there may be some loss of precision
self.assertTrue(np.allclose(result.value/1000, expected.value/1000))

# sum
self.assertRaises(ValueError, lambda : (s-s.min()).sum())
s1 = s[0:10000]
self.assertRaises(ValueError, lambda : (s1-s1.min()).sum())
s2 = s[0:1000]
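# small enough that the sum stays within int64, so no error is raised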
result = (s2-s2.min()).sum()

def test_timedelta_ops_scalar(self):
# GH 6808
base = pd.to_datetime('20130101 09:01:12.123456')
3 changes: 1 addition & 2 deletions pandas/tslib.pyx
@@ -2265,9 +2265,8 @@ class Timedelta(_Timedelta):
return "m"
elif self._h:
return "h"
elif self._d:
else:
return "D"
raise ValueError("invalid resolution")

def round(self, reso):
"""
Expand Down