Skip to content

Commit f71193e

Browse files
Merge remote-tracking branch 'upstream/master' into add-read_excel-checks
2 parents 8252d3b + c3c4ccf commit f71193e

File tree

31 files changed

+438
-117
lines changed

31 files changed

+438
-117
lines changed

asv_bench/benchmarks/arithmetic.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,59 @@ def time_frame_op_with_series_axis1(self, opname):
101101
getattr(operator, opname)(self.df, self.ser)
102102

103103

104+
class FrameWithFrameWide:
105+
# Many-columns, mixed dtypes
106+
107+
params = [
108+
[
109+
# GH#32779 has discussion of which operators are included here
110+
operator.add,
111+
operator.floordiv,
112+
operator.gt,
113+
]
114+
]
115+
param_names = ["op"]
116+
117+
def setup(self, op):
118+
# we choose dtypes so as to make the blocks
119+
# a) not perfectly match between right and left
120+
# b) appreciably bigger than single columns
121+
n_cols = 2000
122+
n_rows = 500
123+
124+
# construct dataframe with 2 blocks
125+
arr1 = np.random.randn(n_rows, int(n_cols / 2)).astype("f8")
126+
arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("f4")
127+
df = pd.concat(
128+
[pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True,
129+
)
130+
# should already be the case, but just to be sure
131+
df._consolidate_inplace()
132+
133+
# TODO: GH#33198 the setting here shouldn't need two steps
134+
arr1 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8")
135+
arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("i8")
136+
arr3 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8")
137+
df2 = pd.concat(
138+
[pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)],
139+
axis=1,
140+
ignore_index=True,
141+
)
142+
# should already be the case, but just to be sure
143+
df2._consolidate_inplace()
144+
145+
self.left = df
146+
self.right = df2
147+
148+
def time_op_different_blocks(self, op):
149+
# blocks (and dtypes) are not aligned
150+
op(self.left, self.right)
151+
152+
def time_op_same_blocks(self, op):
153+
# blocks (and dtypes) are aligned
154+
op(self.left, self.left)
155+
156+
104157
class Ops:
105158

106159
params = [[True, False], ["default", 1]]

doc/source/whatsnew/v1.1.0.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -612,7 +612,7 @@ Performance improvements
612612
and :meth:`~pandas.core.groupby.groupby.GroupBy.last` (:issue:`34178`)
613613
- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`).
614614
- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
615-
615+
- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`)
616616

617617
.. ---------------------------------------------------------------------------
618618
@@ -817,9 +817,10 @@ Groupby/resample/rolling
817817
- Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`)
818818
- Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`)
819819
- Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`)
820+
- Bug in :meth:`Series.groupby` would raise ``ValueError`` when grouping by :class:`PeriodIndex` level (:issue:`34010`)
820821
- Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`)
821822
- Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`)
822-
823+
- Bug in :meth:`DataFrameGroupBy.std` and :meth:`DataFrameGroupBy.sem` would modify grouped-by columns when ``as_index=False`` (:issue:`10355`)
823824

824825
Reshaping
825826
^^^^^^^^^
@@ -884,6 +885,7 @@ Other
884885
- Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`)
885886
- Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`)
886887
- :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`)
888+
- More informative error message with ``np.min`` or ``np.max`` on unordered :class:`Categorical` (:issue:`33115`)
887889
- Bug in :class:`Tick` comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`)
888890

889891
.. ---------------------------------------------------------------------------

pandas/_libs/internals.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ cdef class BlockPlacement:
4949
else:
5050
# Cython memoryview interface requires ndarray to be writeable.
5151
arr = np.require(val, dtype=np.int64, requirements='W')
52-
assert arr.ndim == 1
52+
assert arr.ndim == 1, arr.shape
5353
self._as_array = arr
5454
self._has_array = True
5555

pandas/_libs/tslibs/period.pyx

Lines changed: 46 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ from pandas._libs.tslibs.ccalendar cimport (
5252
from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS
5353
from pandas._libs.tslibs.frequencies cimport (
5454
attrname_to_abbrevs,
55-
get_base_alias,
5655
get_freq_code,
5756
get_freq_str,
5857
get_rule_month,
@@ -1600,9 +1599,7 @@ cdef class _Period:
16001599
raise IncompatibleFrequency("Input cannot be converted to "
16011600
f"Period(freq={self.freqstr})")
16021601
elif util.is_offset_object(other):
1603-
freqstr = other.rule_code
1604-
base = get_base_alias(freqstr)
1605-
if base == self.freq.rule_code:
1602+
if other.base == self.freq.base:
16061603
ordinal = self.ordinal + other.n
16071604
return Period(ordinal=ordinal, freq=self.freq)
16081605
msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
@@ -1613,58 +1610,57 @@ cdef class _Period:
16131610
return NotImplemented
16141611

16151612
def __add__(self, other):
1616-
if is_period_object(self):
1617-
if (PyDelta_Check(other) or util.is_timedelta64_object(other) or
1618-
util.is_offset_object(other)):
1619-
return self._add_delta(other)
1620-
elif other is NaT:
1613+
if not is_period_object(self):
1614+
# cython semantics; this is analogous to a call to __radd__
1615+
if self is NaT:
16211616
return NaT
1622-
elif util.is_integer_object(other):
1623-
ordinal = self.ordinal + other * self.freq.n
1624-
return Period(ordinal=ordinal, freq=self.freq)
1625-
elif (PyDateTime_Check(other) or
1626-
is_period_object(other) or util.is_datetime64_object(other)):
1627-
# can't add datetime-like
1628-
# GH#17983
1629-
sname = type(self).__name__
1630-
oname = type(other).__name__
1631-
raise TypeError(f"unsupported operand type(s) for +: '{sname}' "
1632-
f"and '{oname}'")
1633-
else: # pragma: no cover
1634-
return NotImplemented
1635-
elif is_period_object(other):
1636-
# this can be reached via __radd__ because of cython rules
1637-
return other + self
1638-
else:
1639-
return NotImplemented
1617+
return other.__add__(self)
1618+
1619+
if (PyDelta_Check(other) or util.is_timedelta64_object(other) or
1620+
util.is_offset_object(other)):
1621+
return self._add_delta(other)
1622+
elif other is NaT:
1623+
return NaT
1624+
elif util.is_integer_object(other):
1625+
ordinal = self.ordinal + other * self.freq.n
1626+
return Period(ordinal=ordinal, freq=self.freq)
1627+
elif (PyDateTime_Check(other) or
1628+
is_period_object(other) or util.is_datetime64_object(other)):
1629+
# can't add datetime-like
1630+
# GH#17983
1631+
sname = type(self).__name__
1632+
oname = type(other).__name__
1633+
raise TypeError(f"unsupported operand type(s) for +: '{sname}' "
1634+
f"and '{oname}'")
1635+
1636+
return NotImplemented
16401637

16411638
def __sub__(self, other):
1642-
if is_period_object(self):
1643-
if (PyDelta_Check(other) or util.is_timedelta64_object(other) or
1644-
util.is_offset_object(other)):
1645-
neg_other = -other
1646-
return self + neg_other
1647-
elif util.is_integer_object(other):
1648-
ordinal = self.ordinal - other * self.freq.n
1649-
return Period(ordinal=ordinal, freq=self.freq)
1650-
elif is_period_object(other):
1651-
if other.freq != self.freq:
1652-
msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
1653-
own_freq=self.freqstr,
1654-
other_freq=other.freqstr)
1655-
raise IncompatibleFrequency(msg)
1656-
# GH 23915 - mul by base freq since __add__ is agnostic of n
1657-
return (self.ordinal - other.ordinal) * self.freq.base
1658-
elif other is NaT:
1659-
return NaT
1660-
return NotImplemented
1661-
elif is_period_object(other):
1662-
# this can be reached via __rsub__ because of cython rules
1639+
if not is_period_object(self):
1640+
# cython semantics; this is like a call to __rsub__
16631641
if self is NaT:
16641642
return NaT
16651643
return NotImplemented
1666-
else:
1667-
return NotImplemented
1644+
1645+
elif (PyDelta_Check(other) or util.is_timedelta64_object(other) or
1646+
util.is_offset_object(other)):
1647+
neg_other = -other
1648+
return self + neg_other
1649+
elif util.is_integer_object(other):
1650+
ordinal = self.ordinal - other * self.freq.n
1651+
return Period(ordinal=ordinal, freq=self.freq)
1652+
elif is_period_object(other):
1653+
if other.freq != self.freq:
1654+
msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
1655+
own_freq=self.freqstr,
1656+
other_freq=other.freqstr)
1657+
raise IncompatibleFrequency(msg)
1658+
# GH 23915 - mul by base freq since __add__ is agnostic of n
1659+
return (self.ordinal - other.ordinal) * self.freq.base
1660+
elif other is NaT:
1661+
return NaT
1662+
1663+
return NotImplemented
16681664

16691665
def asfreq(self, freq, how='E') -> "Period":
16701666
"""

pandas/core/arrays/categorical.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from pandas._libs import NaT, algos as libalgos, hashtable as htable
1111
from pandas._typing import ArrayLike, Dtype, Ordered, Scalar
12+
from pandas.compat.numpy import function as nv
1213
from pandas.util._decorators import cache_readonly, deprecate_kwarg, doc
1314
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
1415

@@ -2077,7 +2078,7 @@ def _reduce(self, name, axis=0, **kwargs):
20772078
return func(**kwargs)
20782079

20792080
@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
2080-
def min(self, skipna=True):
2081+
def min(self, skipna=True, **kwargs):
20812082
"""
20822083
The minimum value of the object.
20832084
@@ -2096,6 +2097,7 @@ def min(self, skipna=True):
20962097
-------
20972098
min : the minimum of this `Categorical`
20982099
"""
2100+
nv.validate_min((), kwargs)
20992101
self.check_for_ordered("min")
21002102

21012103
if not len(self._codes):
@@ -2112,7 +2114,7 @@ def min(self, skipna=True):
21122114
return self.categories[pointer]
21132115

21142116
@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
2115-
def max(self, skipna=True):
2117+
def max(self, skipna=True, **kwargs):
21162118
"""
21172119
The maximum value of the object.
21182120
@@ -2131,6 +2133,7 @@ def max(self, skipna=True):
21312133
-------
21322134
max : the maximum of this `Categorical`
21332135
"""
2136+
nv.validate_max((), kwargs)
21342137
self.check_for_ordered("max")
21352138

21362139
if not len(self._codes):

pandas/core/arrays/datetimelike.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ def _validate_comparison_value(self, other):
9898

9999
@unpack_zerodim_and_defer(opname)
100100
def wrapper(self, other):
101+
if self.ndim > 1 and getattr(other, "shape", None) == self.shape:
102+
# TODO: handle 2D-like listlikes
103+
return op(self.ravel(), other.ravel()).reshape(self.shape)
104+
101105
try:
102106
other = _validate_comparison_value(self, other)
103107
except InvalidComparison:
@@ -1308,18 +1312,20 @@ def _addsub_object_array(self, other: np.ndarray, op):
13081312
"""
13091313
assert op in [operator.add, operator.sub]
13101314
if len(other) == 1:
1315+
# If both 1D then broadcasting is unambiguous
1316+
# TODO(EA2D): require self.ndim == other.ndim here
13111317
return op(self, other[0])
13121318

13131319
warnings.warn(
1314-
"Adding/subtracting array of DateOffsets to "
1320+
"Adding/subtracting object-dtype array to "
13151321
f"{type(self).__name__} not vectorized",
13161322
PerformanceWarning,
13171323
)
13181324

13191325
# Caller is responsible for broadcasting if necessary
13201326
assert self.shape == other.shape, (self.shape, other.shape)
13211327

1322-
res_values = op(self.astype("O"), np.array(other))
1328+
res_values = op(self.astype("O"), np.asarray(other))
13231329
result = array(res_values.ravel())
13241330
result = extract_array(result, extract_numpy=True).reshape(self.shape)
13251331
return result

pandas/core/arrays/interval.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
from pandas.core.dtypes.dtypes import IntervalDtype
2828
from pandas.core.dtypes.generic import (
2929
ABCDatetimeIndex,
30-
ABCExtensionArray,
3130
ABCIndexClass,
3231
ABCIntervalIndex,
3332
ABCPeriodIndex,
@@ -767,7 +766,7 @@ def size(self) -> int:
767766
# Avoid materializing self.values
768767
return self.left.size
769768

770-
def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray:
769+
def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray":
771770
if not len(self) or periods == 0:
772771
return self.copy()
773772

pandas/core/arrays/period.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -667,8 +667,8 @@ def _addsub_int_array(
667667

668668
def _add_offset(self, other):
669669
assert not isinstance(other, Tick)
670-
base = libfrequencies.get_base_alias(other.rule_code)
671-
if base != self.freq.rule_code:
670+
671+
if other.base != self.freq.base:
672672
raise raise_on_incompatible(self, other)
673673

674674
# Note: when calling parent class's _add_timedeltalike_scalar,

pandas/core/dtypes/cast.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,9 @@ def is_nested_object(obj) -> bool:
103103
This may not necessarily be performant.
104104
105105
"""
106-
if isinstance(obj, ABCSeries) and is_object_dtype(obj):
106+
if isinstance(obj, ABCSeries) and is_object_dtype(obj.dtype):
107107

108-
if any(isinstance(v, ABCSeries) for v in obj.values):
108+
if any(isinstance(v, ABCSeries) for v in obj._values):
109109
return True
110110

111111
return False

pandas/core/frame.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,7 @@ def __init__(
455455
mgr = self._init_mgr(
456456
data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
457457
)
458+
458459
elif isinstance(data, dict):
459460
mgr = init_dict(data, index, columns, dtype=dtype)
460461
elif isinstance(data, ma.MaskedArray):
@@ -1159,7 +1160,7 @@ def dot(self, other):
11591160
left = self.reindex(columns=common, copy=False)
11601161
right = other.reindex(index=common, copy=False)
11611162
lvals = left.values
1162-
rvals = right.values
1163+
rvals = right._values
11631164
else:
11641165
left = self
11651166
lvals = self.values
@@ -1891,7 +1892,7 @@ def to_records(
18911892
if index:
18921893
if isinstance(self.index, MultiIndex):
18931894
# array of tuples to numpy cols. copy copy copy
1894-
ix_vals = list(map(np.array, zip(*self.index.values)))
1895+
ix_vals = list(map(np.array, zip(*self.index._values)))
18951896
else:
18961897
ix_vals = [self.index.values]
18971898

@@ -3009,7 +3010,7 @@ def _setitem_frame(self, key, value):
30093010
raise ValueError("Array conditional must be same shape as self")
30103011
key = self._constructor(key, **self._construct_axes_dict())
30113012

3012-
if key.values.size and not is_bool_dtype(key.values):
3013+
if key.size and not is_bool_dtype(key.values):
30133014
raise TypeError(
30143015
"Must pass DataFrame or 2-d ndarray with boolean values only"
30153016
)
@@ -5754,10 +5755,11 @@ def _construct_result(self, result) -> "DataFrame":
57545755
-------
57555756
DataFrame
57565757
"""
5757-
out = self._constructor(result, index=self.index, copy=False)
5758+
out = self._constructor(result, copy=False)
57585759
# Pin columns instead of passing to constructor for compat with
57595760
# non-unique columns case
57605761
out.columns = self.columns
5762+
out.index = self.index
57615763
return out
57625764

57635765
def combine(
@@ -7450,7 +7452,7 @@ def applymap(self, func) -> "DataFrame":
74507452
def infer(x):
74517453
if x.empty:
74527454
return lib.map_infer(x, func)
7453-
return lib.map_infer(x.astype(object).values, func)
7455+
return lib.map_infer(x.astype(object)._values, func)
74547456

74557457
return self.apply(infer)
74567458

0 commit comments

Comments
 (0)