Skip to content

Commit 93b122c

Browse files
committed
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
2 parents b846bc2 + 0ab8eb2 commit 93b122c

File tree

153 files changed

+6551
-5025
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

153 files changed

+6551
-5025
lines changed

.pep8speaks.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ pycodestyle:
1313
- W503, # line break before binary operator
1414
- W504, # line break after binary operator
1515
- E402, # module level import not at top of file
16-
- E722, # do not use bare except
1716
- E731, # do not assign a lambda expression, use a def
1817
- C406, # Unnecessary list literal - rewrite as a dict literal.
1918
- C408, # Unnecessary dict call - rewrite as a literal.

asv_bench/benchmarks/timeseries.py

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from datetime import timedelta
22

3+
import dateutil
34
import numpy as np
45
from pandas import to_datetime, date_range, Series, DataFrame, period_range
56
from pandas.tseries.frequencies import infer_freq
@@ -57,7 +58,10 @@ def time_to_pydatetime(self, index_type):
5758

5859
class TzLocalize(object):
5960

60-
def setup(self):
61+
params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()]
62+
param_names = 'tz'
63+
64+
def setup(self, tz):
6165
dst_rng = date_range(start='10/29/2000 1:00:00',
6266
end='10/29/2000 1:59:59', freq='S')
6367
self.index = date_range(start='10/29/2000',
@@ -68,8 +72,8 @@ def setup(self):
6872
end='10/29/2000 3:00:00',
6973
freq='S'))
7074

71-
def time_infer_dst(self):
72-
self.index.tz_localize('US/Eastern', ambiguous='infer')
75+
def time_infer_dst(self, tz):
76+
self.index.tz_localize(tz, ambiguous='infer')
7377

7478

7579
class ResetIndex(object):
@@ -377,15 +381,35 @@ def time_dup_string_tzoffset_dates(self, cache):
377381

378382
class DatetimeAccessor(object):
379383

380-
def setup(self):
384+
params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()]
385+
param_names = 'tz'
386+
387+
def setup(self, tz):
381388
N = 100000
382-
self.series = Series(date_range(start='1/1/2000', periods=N, freq='T'))
389+
self.series = Series(
390+
date_range(start='1/1/2000', periods=N, freq='T', tz=tz)
391+
)
383392

384-
def time_dt_accessor(self):
393+
def time_dt_accessor(self, tz):
385394
self.series.dt
386395

387-
def time_dt_accessor_normalize(self):
396+
def time_dt_accessor_normalize(self, tz):
388397
self.series.dt.normalize()
389398

399+
def time_dt_accessor_month_name(self, tz):
400+
self.series.dt.month_name()
401+
402+
def time_dt_accessor_day_name(self, tz):
403+
self.series.dt.day_name()
404+
405+
def time_dt_accessor_time(self, tz):
406+
self.series.dt.time
407+
408+
def time_dt_accessor_date(self, tz):
409+
self.series.dt.date
410+
411+
def time_dt_accessor_year(self, tz):
412+
self.series.dt.year
413+
390414

391415
from .pandas_vb_common import setup # noqa: F401

asv_bench/benchmarks/timestamp.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from pandas import Timestamp
44
import pytz
5+
import dateutil
56

67

78
class TimestampConstruction(object):
@@ -29,7 +30,8 @@ def time_fromtimestamp(self):
2930

3031

3132
class TimestampProperties(object):
32-
_tzs = [None, pytz.timezone('Europe/Amsterdam')]
33+
_tzs = [None, pytz.timezone('Europe/Amsterdam'), pytz.UTC,
34+
dateutil.tz.tzutc()]
3335
_freqs = [None, 'B']
3436
params = [_tzs, _freqs]
3537
param_names = ['tz', 'freq']
@@ -87,7 +89,8 @@ def time_microsecond(self, tz, freq):
8789

8890

8991
class TimestampOps(object):
90-
params = [None, 'US/Eastern']
92+
params = [None, 'US/Eastern', pytz.UTC,
93+
dateutil.tz.tzutc()]
9194
param_names = ['tz']
9295

9396
def setup(self, tz):
@@ -102,6 +105,17 @@ def time_replace_None(self, tz):
102105
def time_to_pydatetime(self, tz):
103106
self.ts.to_pydatetime()
104107

108+
def time_normalize(self, tz):
109+
self.ts.normalize()
110+
111+
def time_tz_convert(self, tz):
112+
if self.ts.tz is not None:
113+
self.ts.tz_convert(tz)
114+
115+
def time_tz_localize(self, tz):
116+
if self.ts.tz is None:
117+
self.ts.tz_localize(tz)
118+
105119

106120
class TimestampAcrossDst(object):
107121
def setup(self):

doc/source/conf.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,7 @@ def linkcode_resolve(domain, info):
586586
for part in fullname.split('.'):
587587
try:
588588
obj = getattr(obj, part)
589-
except:
589+
except AttributeError:
590590
return None
591591

592592
try:
@@ -595,14 +595,14 @@ def linkcode_resolve(domain, info):
595595
fn = inspect.getsourcefile(inspect.unwrap(obj))
596596
else:
597597
fn = inspect.getsourcefile(obj)
598-
except:
598+
except TypeError:
599599
fn = None
600600
if not fn:
601601
return None
602602

603603
try:
604604
source, lineno = inspect.getsourcelines(obj)
605-
except:
605+
except OSError:
606606
lineno = None
607607

608608
if lineno:

doc/source/contributing.rst

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -591,21 +591,14 @@ run this slightly modified command::
591591

592592
git diff master --name-only -- "*.py" | grep "pandas/" | xargs flake8
593593

594-
Note that on Windows, these commands are unfortunately not possible because
595-
commands like ``grep`` and ``xargs`` are not available natively. To imitate the
596-
behavior with the commands above, you should run::
594+
Windows does not support the ``grep`` and ``xargs`` commands (unless installed
595+
for example via the `MinGW <http://www.mingw.org/>`__ toolchain), but one can
596+
imitate the behaviour as follows::
597597

598-
git diff master --name-only -- "*.py"
598+
for /f %i in ('git diff upstream/master --name-only ^| findstr pandas/') do flake8 %i
599599

600-
This will list all of the Python files that have been modified. The only ones
601-
that matter during linting are any whose directory filepath begins with "pandas."
602-
For each filepath, copy and paste it after the ``flake8`` command as shown below:
603-
604-
flake8 <python-filepath>
605-
606-
Alternatively, you can install the ``grep`` and ``xargs`` commands via the
607-
`MinGW <http://www.mingw.org/>`__ toolchain, and it will allow you to run the
608-
commands above.
600+
This will also get all the files being changed by the PR (and within the
601+
``pandas/`` folder), and run ``flake8`` on them one after the other.
609602

610603
.. _contributing.import-formatting:
611604

doc/source/install.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,9 @@ Optional Dependencies
286286
`xsel <http://www.vergenet.net/~conrad/software/xsel/>`__, or
287287
`xclip <https://github.com/astrand/xclip/>`__: necessary to use
288288
:func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation.
289-
* `pandas-gbq <https://pandas-gbq.readthedocs.io/en/latest/install.html#dependencies>`__: for Google BigQuery I/O.
289+
* `pandas-gbq
290+
<https://pandas-gbq.readthedocs.io/en/latest/install.html#dependencies>`__:
291+
for Google BigQuery I/O. (pandas-gbq >= 0.8.0)
290292

291293

292294
* `Backports.lzma <https://pypi.org/project/backports.lzma/>`__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library.

doc/source/whatsnew/v0.24.0.rst

Lines changed: 73 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ New features
2424
the user to override the engine's default behavior to include or omit the
2525
dataframe's indexes from the resulting Parquet file. (:issue:`20768`)
2626
- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`)
27-
27+
- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing
28+
the user to specify which decimal separator should be used in the output. (:issue:`23614`)
2829

2930
.. _whatsnew_0240.enhancements.extension_array_operators:
3031

@@ -183,6 +184,47 @@ array, but rather an ``ExtensionArray``:
183184
This is the same behavior as ``Series.values`` for categorical data. See
184185
:ref:`whatsnew_0240.api_breaking.interval_values` for more.
185186

187+
.. _whatsnew_0240.enhancements.join_with_two_multiindexes:
188+
189+
Joining with two multi-indexes
190+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
191+
192+
:func:`DataFrame.merge` and :func:`DataFrame.join` can now be used to join multi-indexed ``DataFrame`` instances on the overlapping index levels (:issue:`6360`)
193+
194+
See the :ref:`Merge, join, and concatenate
195+
<merging.Join_with_two_multi_indexes>` documentation section.
196+
197+
.. ipython:: python
198+
199+
index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'),
200+
('K1', 'X2')],
201+
names=['key', 'X'])
202+
203+
204+
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
205+
'B': ['B0', 'B1', 'B2']},
206+
index=index_left)
207+
208+
209+
index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),
210+
('K2', 'Y2'), ('K2', 'Y3')],
211+
names=['key', 'Y'])
212+
213+
214+
right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
215+
'D': ['D0', 'D1', 'D2', 'D3']},
216+
index=index_right)
217+
218+
219+
left.join(right)
220+
221+
For earlier versions this can be done using the following.
222+
223+
.. ipython:: python
224+
225+
pd.merge(left.reset_index(), right.reset_index(),
226+
on=['key'], how='inner').set_index(['key', 'X', 'Y'])
227+
186228
.. _whatsnew_0240.enhancements.rename_axis:
187229

188230
Renaming names in a MultiIndex
@@ -218,9 +260,12 @@ Other Enhancements
218260
- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`)
219261
- Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`)
220262
- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to
221-
reflect changes from the `Pandas-GBQ library version 0.6.0
222-
<https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-6-0>`__.
223-
(:issue:`21627`, :issue:`22557`)
263+
reflect changes from the `Pandas-GBQ library version 0.8.0
264+
<https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-8-0>`__.
265+
Adds a ``credentials`` argument, which enables the use of any kind of
266+
`google-auth credentials
267+
<https://google-auth.readthedocs.io/en/latest/>`__. (:issue:`21627`,
268+
:issue:`22557`, :issue:`23662`)
224269
- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`)
225270
- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
226271
- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
@@ -246,6 +291,7 @@ Other Enhancements
246291
- :meth:`read_excel()` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`)
247292
- :meth:`MultiIndex.to_flat_index` has been added to flatten multiple levels into a single-level :class:`Index` object.
248293
- :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed string columns to Stata strl format (:issue:`23633`)
294+
- :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained an ``axis`` parameter (:issue:`8839`)
249295

250296
.. _whatsnew_0240.api_breaking:
251297

@@ -271,17 +317,19 @@ If installed, we now require:
271317
+-----------------+-----------------+----------+
272318
| bottleneck | 1.2.0 | |
273319
+-----------------+-----------------+----------+
320+
| fastparquet | 0.1.2 | |
321+
+-----------------+-----------------+----------+
274322
| matplotlib | 2.0.0 | |
275323
+-----------------+-----------------+----------+
276324
| numexpr | 2.6.1 | |
277325
+-----------------+-----------------+----------+
278-
| pytables | 3.4.2 | |
279-
+-----------------+-----------------+----------+
280-
| scipy | 0.18.1 | |
326+
| pandas-gbq | 0.8.0 | |
281327
+-----------------+-----------------+----------+
282328
| pyarrow | 0.7.0 | |
283329
+-----------------+-----------------+----------+
284-
| fastparquet | 0.1.2 | |
330+
| pytables | 3.4.2 | |
331+
+-----------------+-----------------+----------+
332+
| scipy | 0.18.1 | |
285333
+-----------------+-----------------+----------+
286334

287335
Additionally we no longer depend on `feather-format` for feather based storage
@@ -960,7 +1008,10 @@ Other API Changes
9601008
- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
9611009
- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
9621010
- Comparing :class:`Timedelta` to be less or greater than unknown types now raises a ``TypeError`` instead of returning ``False`` (:issue:`20829`)
1011+
- :meth:`Categorical.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23466`).
1012+
- :meth:`Categorical.searchsorted` now raises a ``KeyError`` rather than a ``ValueError``, if a searched for key is not found in its categories (:issue:`23466`).
9631013
- :meth:`Index.hasnans` and :meth:`Series.hasnans` now always return a python boolean. Previously, a python or a numpy boolean could be returned, depending on circumstances (:issue:`23294`).
1014+
- The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. (:issue:`23614`)
9641015

9651016
.. _whatsnew_0240.deprecations:
9661017

@@ -981,10 +1032,14 @@ Deprecations
9811032
- The ``fastpath`` keyword of the different Index constructors is deprecated (:issue:`23110`).
9821033
- :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have deprecated the ``errors`` argument in favor of the ``nonexistent`` argument (:issue:`8917`)
9831034
- The class ``FrozenNDArray`` has been deprecated. When unpickling, ``FrozenNDArray`` will be unpickled to ``np.ndarray`` once this class is removed (:issue:`9031`)
1035+
- The methods :meth:`DataFrame.update` and :meth:`Panel.update` have deprecated the ``raise_conflict=False|True`` keyword in favor of ``errors='ignore'|'raise'`` (:issue:`23585`)
9841036
- Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of
9851037
`use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`)
9861038
- :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`)
9871039
- Constructing a :class:`TimedeltaIndex` from data with ``datetime64``-dtyped data is deprecated, will raise ``TypeError`` in a future version (:issue:`23539`)
1040+
- The ``keep_tz=False`` option (the default) of the ``keep_tz`` keyword of
1041+
:meth:`DatetimeIndex.to_series` is deprecated (:issue:`17832`).
1042+
- Timezone converting a tz-aware ``datetime.datetime`` or :class:`Timestamp` with :class:`Timestamp` and the ``tz`` argument is now deprecated. Instead, use :meth:`Timestamp.tz_convert` (:issue:`23579`)
9881043

9891044
.. _whatsnew_0240.deprecations.datetimelike_int_ops:
9901045

@@ -1088,6 +1143,8 @@ Performance Improvements
10881143
- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
10891144
- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`)
10901145
- Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`)
1146+
- Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`)
1147+
- Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`)
10911148

10921149

10931150
.. _whatsnew_0240.docs:
@@ -1217,8 +1274,8 @@ Numeric
12171274
Strings
12181275
^^^^^^^
12191276

1220-
-
1221-
-
1277+
- Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`).
1278+
- Bug in :meth:`Index.str.split` was not nan-safe (:issue:`23677`).
12221279
-
12231280

12241281
Interval
@@ -1315,16 +1372,20 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
13151372
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
13161373
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
13171374
- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`)
1375+
- Bug in :func:`read_csv()` in which unnamed columns were being improperly identified when extracting a multi-index (:issue:`23687`)
13181376
- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
13191377
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
13201378
- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)
13211379
- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. (:issue:`21552`)
1380+
- Bug in :func:`to_html()` with ``index=False`` misses truncation indicators (...) on truncated DataFrame (:issue:`15019`, :issue:`22783`)
13221381
- Bug in :func:`DataFrame.to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`)
13231382
- Bug in :func:`DataFrame.to_string()` that caused representations of :class:`DataFrame` to not take up the whole window (:issue:`22984`)
13241383
- Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`).
13251384
- Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`)
1385+
- Bug in :func:`read_csv()` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`)
13261386
- Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)
13271387
- Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
1388+
- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`)
13281389
- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`)
13291390
- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`)
13301391

@@ -1373,6 +1434,7 @@ Reshaping
13731434
- Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`)
13741435
- Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`)
13751436
- Bug in :meth:`DataFrame.nsmallest` and :meth:`DataFrame.nlargest` for dataframes that have a :class:`MultiIndex` for columns (:issue:`23033`).
1437+
- Bug in :meth:`DataFrame.append` with a :class:`Series` with a dateutil timezone would raise a ``TypeError`` (:issue:`23682`)
13761438

13771439
.. _whatsnew_0240.bug_fixes.sparse:
13781440

@@ -1387,6 +1449,7 @@ Sparse
13871449
- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`)
13881450
- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`)
13891451
- Bug in :meth:`SparseArray.unique` not returning the unique values (:issue:`19595`)
1452+
- Bug in :meth:`SparseArray.nonzero` and :meth:`SparseDataFrame.dropna` returning shifted/incorrect results (:issue:`21172`)
13901453

13911454
Build Changes
13921455
^^^^^^^^^^^^^

0 commit comments

Comments
 (0)