
Commit e7f5bf2

Merge remote-tracking branch 'upstream/master' into bump_numpy
2 parents: 9dc846a + 12a0dc4


55 files changed: +573, -327 lines

.pep8speaks.yml

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ pycodestyle:
     max-line-length: 79
     ignore:
         - W503,  # line break before binary operator
+        - W504,  # line break after binary operator
         - E402,  # module level import not at top of file
         - E722,  # do not use bare except
         - E731,  # do not assign a lambda expression, use a def
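For context, W503 and W504 are pycodestyle's complementary warnings about where a line may break around a binary operator; ignoring both lets either style pass the check. A small illustration (not taken from the repository):

    first_value, second_value = 1, 2

    # W503 flags a break *before* a binary operator:
    total = (first_value
             + second_value)

    # W504 flags a break *after* a binary operator:
    total = (first_value +
             second_value)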

asv_bench/benchmarks/period.py

Lines changed: 3 additions & 0 deletions
@@ -119,3 +119,6 @@ def time_align(self):

     def time_intersection(self):
         self.index[:750].intersection(self.index[250:])
+
+    def time_unique(self):
+        self.index.unique()
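The new benchmark covers PeriodIndex.unique(), whose speedup is listed in the whatsnew below. A rough standalone timing sketch (the index size and repetition factor are illustrative, not the benchmark suite's setup):

    import timeit

    import pandas as pd

    # A PeriodIndex with many repeated values, so unique() has real work to do.
    idx = pd.period_range('2000-01-01', periods=1000, freq='D').repeat(100)

    elapsed = timeit.timeit(lambda: idx.unique(), number=100)
    print('PeriodIndex.unique(): {:.6f}s per call'.format(elapsed / 100))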

doc/source/api.rst

Lines changed: 9 additions & 0 deletions
@@ -245,6 +245,15 @@ Top-level evaluation

    eval

+Hashing
+~~~~~~~
+
+.. autosummary::
+   :toctree: generated/
+
+   util.hash_array
+   util.hash_pandas_object
+
 Testing
 ~~~~~~~
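Both helpers are importable from pandas.util. A small usage sketch (the hashes are deterministic uint64 values; the exact numbers are omitted here):

    import numpy as np
    import pandas as pd

    s = pd.Series(['a', 'b', 'c'])

    # Hash a pandas object element-wise; returns a uint64 Series aligned to s,
    # optionally mixing in the index.
    print(pd.util.hash_pandas_object(s, index=True))

    # Hash a raw 1-D array of values.
    print(pd.util.hash_array(np.array(['a', 'b', 'c'], dtype=object)))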

doc/source/whatsnew/v0.24.0.txt

Lines changed: 27 additions & 1 deletion
@@ -48,7 +48,7 @@ Pandas has gained the ability to hold integer dtypes with missing values. This l
 Here is an example of the usage.

 We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value
-marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`)
+marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`)

 .. ipython:: python
@@ -91,6 +91,13 @@ These dtypes can be merged & reshaped & casted.
     pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes
     df['A'].astype(float)

+Reduction and groupby operations such as 'sum' work.
+
+.. ipython:: python
+
+    df.sum()
+    df.groupby('B').A.sum()
+
 .. warning::

     The Integer NA support currently uses the capitalized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date.
@@ -397,6 +404,22 @@ is the case with :attr:`Period.end_time`, for example

     p.end_time

+.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient:
+
+Raise ValueError in ``DataFrame.to_dict(orient='index')``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
+``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`)
+
+.. ipython:: python
+    :okexcept:
+
+    df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
+    df
+
+    df.to_dict(orient='index')
+
 .. _whatsnew_0240.api.datetimelike.normalize:

 Tick DateOffset Normalize Restrictions
@@ -572,7 +595,9 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
 - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`).
 - Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
 - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
+- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`)
 - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
+- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)

 .. _whatsnew_0240.api.incompatibilities:

@@ -708,6 +733,7 @@ Performance Improvements
   (:issue:`21372`)
 - Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`)
 - Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
+- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`)


 .. _whatsnew_0240.docs:
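The Integer NA entries above advertise reductions and groupby reductions on the nullable Int64 dtype. A brief sketch of that behavior, assuming pandas 0.24 or later (the frame below is illustrative, not the whatsnew example verbatim):

    import pandas as pd

    df = pd.DataFrame({'A': pd.Series([1, None, 3], dtype='Int64'),
                       'B': [1, 1, 2]})

    # Column reductions skip the missing value by default.
    print(df.sum())

    # Groupby reductions on the nullable integer column work as well.
    print(df.groupby('B').A.sum())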

pandas/conftest.py

Lines changed: 24 additions & 0 deletions
@@ -131,6 +131,30 @@ def all_arithmetic_operators(request):
     return request.param


+_all_numeric_reductions = ['sum', 'max', 'min',
+                           'mean', 'prod', 'std', 'var', 'median',
+                           'kurt', 'skew']
+
+
+@pytest.fixture(params=_all_numeric_reductions)
+def all_numeric_reductions(request):
+    """
+    Fixture for numeric reduction names
+    """
+    return request.param
+
+
+_all_boolean_reductions = ['all', 'any']
+
+
+@pytest.fixture(params=_all_boolean_reductions)
+def all_boolean_reductions(request):
+    """
+    Fixture for boolean reduction names
+    """
+    return request.param
+
+
 _cython_table = pd.core.base.SelectionMixin._cython_table.items()
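These fixtures feed reduction names to tests one at a time. A hypothetical test using the numeric fixture might look like this (the test itself is illustrative and not part of the diff):

    import pandas as pd


    def test_numeric_reduction_matches_float(all_numeric_reductions):
        # pytest injects one name per run: 'sum', 'max', ..., 'skew'.
        s = pd.Series([1, 2, 3, 4, 5], dtype='Int64')
        result = getattr(s, all_numeric_reductions)()
        expected = getattr(s.astype('float64'), all_numeric_reductions)()
        assert result == expected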

pandas/core/algorithms.py

Lines changed: 2 additions & 2 deletions
@@ -274,8 +274,8 @@ def match(to_match, values, na_sentinel=-1):
         # replace but return a numpy array
         # use a Series because it handles dtype conversions properly
         from pandas import Series
-        result = Series(result.ravel()).replace(-1, na_sentinel).values.\
-            reshape(result.shape)
+        result = Series(result.ravel()).replace(-1, na_sentinel)
+        result = result.values.reshape(result.shape)

     return result

pandas/core/arrays/base.py

Lines changed: 36 additions & 0 deletions
@@ -63,6 +63,10 @@ class ExtensionArray(object):
     as they only compose abstract methods. Still, a more efficient
     implementation may be available, and these methods can be overridden.

+    One can implement methods to handle array reductions.
+
+    * _reduce
+
     This class does not inherit from 'abc.ABCMeta' for performance reasons.
     Methods and properties required by the interface raise
     ``pandas.errors.AbstractMethodError`` and no ``register`` method is

@@ -466,6 +470,11 @@ def _values_for_factorize(self):
         as NA in the factorization routines, so it will be coded as
         `na_sentinel` and not included in `uniques`. By default,
         ``np.nan`` is used.
+
+        Notes
+        -----
+        The values returned by this method are also used in
+        :func:`pandas.util.hash_pandas_object`.
         """
         return self.astype(object), np.nan

@@ -670,6 +679,33 @@ def _ndarray_values(self):
         """
         return np.array(self)

+    def _reduce(self, name, skipna=True, **kwargs):
+        """
+        Return a scalar result of performing the reduction operation.
+
+        Parameters
+        ----------
+        name : str
+            Name of the function, supported values are:
+            { any, all, min, max, sum, mean, median, prod,
+            std, var, sem, kurt, skew }.
+        skipna : bool, default True
+            If True, skip NaN values.
+        **kwargs
+            Additional keyword arguments passed to the reduction function.
+            Currently, `ddof` is the only supported kwarg.
+
+        Returns
+        -------
+        scalar
+
+        Raises
+        ------
+        TypeError : subclass does not define reductions
+        """
+        raise TypeError("cannot perform {name} with type {dtype}".format(
+            name=name, dtype=self.dtype))
+

 class ExtensionOpsMixin(object):
     """

pandas/core/arrays/categorical.py

Lines changed: 2 additions & 4 deletions
@@ -2069,14 +2069,12 @@ def _reverse_indexer(self):
         return result

     # reduction ops #
-    def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
-                filter_type=None, **kwds):
-        """ perform the reduction type operation """
+    def _reduce(self, name, axis=0, skipna=True, **kwargs):
         func = getattr(self, name, None)
         if func is None:
             msg = 'Categorical cannot perform the operation {op}'
             raise TypeError(msg.format(op=name))
-        return func(numeric_only=numeric_only, **kwds)
+        return func(**kwargs)

     def min(self, numeric_only=None, **kwargs):
         """ The minimum value of the object.

pandas/core/arrays/datetimelike.py

Lines changed: 6 additions & 16 deletions
@@ -41,7 +41,7 @@
 from pandas.util._decorators import deprecate_kwarg


-def _make_comparison_op(op, cls):
+def _make_comparison_op(cls, op):
     # TODO: share code with indexes.base version? Main difference is that
     # the block for MultiIndex was removed here.
     def cmp_method(self, other):
@@ -740,6 +740,9 @@ def __isub__(self, other):
     # --------------------------------------------------------------
     # Comparison Methods

+    # Called by _add_comparison_methods defined in ExtensionOpsMixin
+    _create_comparison_method = classmethod(_make_comparison_op)
+
     def _evaluate_compare(self, other, op):
         """
         We have been called because a comparison between
@@ -773,21 +776,8 @@ def _evaluate_compare(self, other, op):
             result[mask] = filler
         return result

-    # TODO: get this from ExtensionOpsMixin
-    @classmethod
-    def _add_comparison_methods(cls):
-        """ add in comparison methods """
-        # DatetimeArray and TimedeltaArray comparison methods will
-        # call these as their super(...) methods
-        cls.__eq__ = _make_comparison_op(operator.eq, cls)
-        cls.__ne__ = _make_comparison_op(operator.ne, cls)
-        cls.__lt__ = _make_comparison_op(operator.lt, cls)
-        cls.__gt__ = _make_comparison_op(operator.gt, cls)
-        cls.__le__ = _make_comparison_op(operator.le, cls)
-        cls.__ge__ = _make_comparison_op(operator.ge, cls)
-
-
-DatetimeLikeArrayMixin._add_comparison_methods()
+
+DatetimeLikeArrayMixin._add_comparison_ops()


 # -------------------------------------------------------------------
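The deleted _add_comparison_methods is replaced by the hook that ExtensionOpsMixin._add_comparison_ops consumes: the class exposes _create_comparison_method, and the mixin attaches the dunder methods. A stripped-down sketch of that wiring with toy classes (not the pandas internals):

    import operator


    class OpsMixin:
        @classmethod
        def _add_comparison_ops(cls):
            # The mixin only knows the factory hook; the subclass decides
            # how each comparison method is built.
            cls.__eq__ = cls._create_comparison_method(operator.eq)
            cls.__ne__ = cls._create_comparison_method(operator.ne)


    def _make_comparison_op(cls, op):
        def cmp_method(self, other):
            return op(self.value, other.value)
        cmp_method.__name__ = '__{}__'.format(op.__name__)
        return cmp_method


    class Wrapped(OpsMixin):
        # Same wiring as the diff: the module-level factory becomes the hook.
        _create_comparison_method = classmethod(_make_comparison_op)

        def __init__(self, value):
            self.value = value


    Wrapped._add_comparison_ops()
    print(Wrapped(1) == Wrapped(1))  # True
    print(Wrapped(1) != Wrapped(2))  # True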

pandas/core/arrays/integer.py

Lines changed: 26 additions & 0 deletions
@@ -8,6 +8,7 @@
 from pandas.compat import u, range, string_types
 from pandas.compat import set_function_name

+from pandas.core import nanops
 from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
 from pandas.core.dtypes.common import (
@@ -529,6 +530,31 @@ def cmp_method(self, other):
         name = '__{name}__'.format(name=op.__name__)
         return set_function_name(cmp_method, name, cls)

+    def _reduce(self, name, skipna=True, **kwargs):
+        data = self._data
+        mask = self._mask
+
+        # coerce to a nan-aware float if needed
+        if mask.any():
+            data = self._data.astype('float64')
+            data[mask] = self._na_value
+
+        op = getattr(nanops, 'nan' + name)
+        result = op(data, axis=0, skipna=skipna, mask=mask)
+
+        # if we have a boolean op, don't coerce
+        if name in ['any', 'all']:
+            pass
+
+        # if we have a preservable numeric op,
+        # provide coercion back to an integer type if possible
+        elif name in ['sum', 'min', 'max', 'prod'] and notna(result):
+            int_result = int(result)
+            if int_result == result:
+                result = int_result
+
+        return result
+
     def _maybe_mask_result(self, result, mask, other, op_name):
         """
         Parameters
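Given this _reduce, reductions on the nullable integer dtype come back as plain ints when the lossless-coercion branch applies, and as floats otherwise. A short illustration, assuming pandas 0.24+:

    import pandas as pd

    s = pd.Series([1, 2, None], dtype='Int64')

    print(s.sum())   # 3 -- 'sum' is coerced back to an int when lossless
    print(s.mean())  # 1.5 -- 'mean' is not in the preservable list, stays float
    print(s.max())   # 2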

pandas/core/base.py

Lines changed: 2 additions & 2 deletions
@@ -395,8 +395,8 @@ def nested_renaming_depr(level=4):

                 elif isinstance(obj, ABCSeries):
                     nested_renaming_depr()
-                elif isinstance(obj, ABCDataFrame) and \
-                        k not in obj.columns:
+                elif (isinstance(obj, ABCDataFrame) and
+                        k not in obj.columns):
                     raise KeyError(
                         "Column '{col}' does not exist!".format(col=k))

pandas/core/frame.py

Lines changed: 4 additions & 0 deletions
@@ -1224,6 +1224,10 @@ def to_dict(self, orient='dict', into=dict):
                          for k, v in zip(self.columns, np.atleast_1d(row)))
                     for row in self.values]
         elif orient.lower().startswith('i'):
+            if not self.index.is_unique:
+                raise ValueError(
+                    "DataFrame index must be unique for orient='index'."
+                )
             return into_c((t[0], dict(zip(self.columns, t[1:])))
                           for t in self.itertuples())
         else:
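With this check, a frame whose index has duplicate labels now fails loudly instead of silently keeping only the last row per label. A quick sketch of the new behavior plus one possible workaround (the workaround is a suggestion, not part of the change):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])

    try:
        df.to_dict(orient='index')
    except ValueError as err:
        print(err)  # DataFrame index must be unique for orient='index'.

    # Workaround: keep every row by switching to a list-of-records layout.
    print(df.to_dict(orient='records'))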

pandas/core/generic.py

Lines changed: 2 additions & 2 deletions
@@ -5651,8 +5651,8 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
                 # fill in 2d chunks
                 result = {col: s.fillna(method=method, value=value)
                           for col, s in self.iteritems()}
-                new_obj = self._constructor.\
-                    from_dict(result).__finalize__(self)
+                prelim_obj = self._constructor.from_dict(result)
+                new_obj = prelim_obj.__finalize__(self)
                 new_data = new_obj._data

             else:

pandas/core/groupby/generic.py

Lines changed: 3 additions & 2 deletions
@@ -1027,8 +1027,9 @@ def nunique(self, dropna=True):
         try:
             sorter = np.lexsort((val, ids))
         except TypeError:  # catches object dtypes
-            assert val.dtype == object, \
-                'val.dtype must be object, got %s' % val.dtype
+            msg = ('val.dtype must be object, got {dtype}'
+                   .format(dtype=val.dtype))
+            assert val.dtype == object, msg
             val, _ = algorithms.factorize(val, sort=False)
             sorter = np.lexsort((val, ids))
             _isna = lambda a: a == -1

pandas/core/groupby/groupby.py

Lines changed: 8 additions & 6 deletions
@@ -578,8 +578,8 @@ def wrapper(*args, **kwargs):
             # a little trickery for aggregation functions that need an axis
             # argument
             kwargs_with_axis = kwargs.copy()
-            if 'axis' not in kwargs_with_axis or \
-                    kwargs_with_axis['axis'] is None:
+            if ('axis' not in kwargs_with_axis or
+                    kwargs_with_axis['axis'] is None):
                 kwargs_with_axis['axis'] = self.axis

             def curried_with_axis(x):
@@ -1490,8 +1490,10 @@ def nth(self, n, dropna=None):
         self._set_group_selection()

         if not dropna:
-            mask = np.in1d(self._cumcount_array(), nth_values) | \
-                np.in1d(self._cumcount_array(ascending=False) + 1, -nth_values)
+            mask_left = np.in1d(self._cumcount_array(), nth_values)
+            mask_right = np.in1d(self._cumcount_array(ascending=False) + 1,
+                                 -nth_values)
+            mask = mask_left | mask_right

             out = self._selected_obj[mask]
             if not self.as_index:
@@ -1552,8 +1554,8 @@ def nth(self, n, dropna=None):
             result.loc[mask] = np.nan

             # reset/reindex to the original groups
-            if len(self.obj) == len(dropped) or \
-                    len(result) == len(self.grouper.result_index):
+            if (len(self.obj) == len(dropped) or
+                    len(result) == len(self.grouper.result_index)):
                 result.index = self.grouper.result_index
             else:
                 result = result.reindex(self.grouper.result_index)
