Skip to content

Commit 81bc116

Browse files
Merge remote-tracking branch 'upstream/master' into numba
2 parents cc20a71 + 5bf5ae8 commit 81bc116

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+553
-588
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ jobs:
153153
run: |
154154
source activate pandas-dev
155155
pytest pandas/tests/frame/methods --array-manager
156+
pytest pandas/tests/arithmetic/ --array-manager
156157
157158
# indexing subset (temporary since other tests don't pass yet)
158159
pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean --array-manager

asv_bench/benchmarks/arithmetic.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -116,16 +116,26 @@ class FrameWithFrameWide:
116116
operator.add,
117117
operator.floordiv,
118118
operator.gt,
119-
]
119+
],
120+
[
121+
# (n_rows, n_columns)
122+
(1_000_000, 10),
123+
(100_000, 100),
124+
(10_000, 1000),
125+
(1000, 10_000),
126+
],
120127
]
121-
param_names = ["op"]
128+
param_names = ["op", "shape"]
122129

123-
def setup(self, op):
130+
def setup(self, op, shape):
124131
# we choose dtypes so as to make the blocks
125132
# a) not perfectly match between right and left
126133
# b) appreciably bigger than single columns
127-
n_cols = 2000
128-
n_rows = 500
134+
n_rows, n_cols = shape
135+
136+
if op is operator.floordiv:
137+
# floordiv is much slower than the other operations -> use less data
138+
n_rows = n_rows // 10
129139

130140
# construct dataframe with 2 blocks
131141
arr1 = np.random.randn(n_rows, n_cols // 2).astype("f8")
@@ -137,7 +147,7 @@ def setup(self, op):
137147
df._consolidate_inplace()
138148

139149
# TODO: GH#33198 the setting here shouldn't need two steps
140-
arr1 = np.random.randn(n_rows, n_cols // 4).astype("f8")
150+
arr1 = np.random.randn(n_rows, max(n_cols // 4, 3)).astype("f8")
141151
arr2 = np.random.randn(n_rows, n_cols // 2).astype("i8")
142152
arr3 = np.random.randn(n_rows, n_cols // 4).astype("f8")
143153
df2 = pd.concat(
@@ -151,11 +161,11 @@ def setup(self, op):
151161
self.left = df
152162
self.right = df2
153163

154-
def time_op_different_blocks(self, op):
164+
def time_op_different_blocks(self, op, shape):
155165
# blocks (and dtypes) are not aligned
156166
op(self.left, self.right)
157167

158-
def time_op_same_blocks(self, op):
168+
def time_op_same_blocks(self, op, shape):
159169
# blocks (and dtypes) are aligned
160170
op(self.left, self.left)
161171

asv_bench/benchmarks/groupby.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,29 @@ def time_dtype_as_field(self, dtype, method, application):
461461
self.as_field_method()
462462

463463

464+
class GroupByCythonAgg:
465+
"""
466+
Benchmarks specifically targetting our cython aggregation algorithms
467+
(using a big enough dataframe with simple key, so a large part of the
468+
time is actually spent in the grouped aggregation).
469+
"""
470+
471+
param_names = ["dtype", "method"]
472+
params = [
473+
["float64"],
474+
["sum", "prod", "min", "max", "mean", "median", "var", "first", "last"],
475+
]
476+
477+
def setup(self, dtype, method):
478+
N = 1_000_000
479+
df = DataFrame(np.random.randn(N, 10), columns=list("abcdefghij"))
480+
df["key"] = np.random.randint(0, 100, size=N)
481+
self.df = df
482+
483+
def time_frame_agg(self, dtype, method):
484+
self.df.groupby("key").agg(method)
485+
486+
464487
class RankWithTies:
465488
# GH 21237
466489
param_names = ["dtype", "tie_method"]

asv_bench/benchmarks/libs.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""
2+
Benchmarks for code in pandas/_libs, excluding pandas/_libs/tslibs,
3+
which has its own directory
4+
"""
5+
import numpy as np
6+
7+
from pandas._libs.lib import (
8+
is_list_like,
9+
is_scalar,
10+
)
11+
12+
from pandas import (
13+
NA,
14+
NaT,
15+
)
16+
17+
# TODO: share with something in pd._testing?
18+
scalars = [
19+
0,
20+
1.0,
21+
1 + 2j,
22+
True,
23+
"foo",
24+
b"bar",
25+
None,
26+
np.datetime64(123, "ns"),
27+
np.timedelta64(123, "ns"),
28+
NaT,
29+
NA,
30+
]
31+
zero_dims = [np.array("123")]
32+
listlikes = [np.array([1, 2, 3]), {0: 1}, {1, 2, 3}, [1, 2, 3], (1, 2, 3)]
33+
34+
35+
class ScalarListLike:
36+
params = scalars + zero_dims + listlikes
37+
38+
def time_is_list_like(self, param):
39+
is_list_like(param)
40+
41+
def time_is_scalar(self, param):
42+
is_scalar(param)

doc/make.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class DocBuilder:
3939

4040
def __init__(
4141
self,
42-
num_jobs=0,
42+
num_jobs="auto",
4343
include_api=True,
4444
whatsnew=False,
4545
single_doc=None,
@@ -135,7 +135,7 @@ def _sphinx_build(self, kind: str):
135135

136136
cmd = ["sphinx-build", "-b", kind]
137137
if self.num_jobs:
138-
cmd += ["-j", str(self.num_jobs)]
138+
cmd += ["-j", self.num_jobs]
139139
if self.warnings_are_errors:
140140
cmd += ["-W", "--keep-going"]
141141
if self.verbosity:
@@ -304,7 +304,7 @@ def main():
304304
"command", nargs="?", default="html", help=f"command to run: {joined}"
305305
)
306306
argparser.add_argument(
307-
"--num-jobs", type=int, default=0, help="number of jobs used by sphinx-build"
307+
"--num-jobs", default="auto", help="number of jobs used by sphinx-build"
308308
)
309309
argparser.add_argument(
310310
"--no-api", default=False, help="omit api and autosummary", action="store_true"

doc/source/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,7 @@
423423
if include_api:
424424
intersphinx_mapping = {
425425
"dateutil": ("https://dateutil.readthedocs.io/en/latest/", None),
426-
"matplotlib": ("https://matplotlib.org/", None),
426+
"matplotlib": ("https://matplotlib.org/stable/", None),
427427
"numpy": ("https://numpy.org/doc/stable/", None),
428428
"pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None),
429429
"py": ("https://pylib.readthedocs.io/en/latest/", None),

doc/source/development/contributing.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -612,7 +612,8 @@ For comparison, a full documentation build may take 15 minutes, but a single
612612
section may take 15 seconds. Subsequent builds, which only process portions
613613
you have changed, will be faster.
614614

615-
You can also specify to use multiple cores to speed up the documentation build::
615+
The build will automatically use the number of cores available on your machine
616+
to speed up the documentation build. You can override this::
616617

617618
python make.py html --num-jobs 4
618619

doc/source/user_guide/gotchas.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,9 @@ testing for membership in the list of column names.
183183
Mutating with User Defined Function (UDF) methods
184184
-------------------------------------------------
185185

186+
This section applies to pandas methods that take a UDF. In particular, the methods
187+
``.apply``, ``.aggregate``, ``.transform``, and ``.filter``.
188+
186189
It is a general rule in programming that one should not mutate a container
187190
while it is being iterated over. Mutation will invalidate the iterator,
188191
causing unexpected behavior. Consider the example:
@@ -246,7 +249,6 @@ not apply to the container being iterated over.
246249
df = pd.DataFrame({"a": [1, 2, 3], 'b': [4, 5, 6]})
247250
df.apply(f, axis="columns")
248251
249-
250252
``NaN``, Integer ``NA`` values and ``NA`` type promotions
251253
---------------------------------------------------------
252254

doc/source/user_guide/io.rst

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2853,14 +2853,12 @@ See the :ref:`cookbook<cookbook.excel>` for some advanced strategies.
28532853
The `xlrd <https://xlrd.readthedocs.io/en/latest/>`__ package is now only for reading
28542854
old-style ``.xls`` files.
28552855

2856-
Before pandas 1.2.0, the default argument ``engine=None`` to :func:`~pandas.read_excel`
2856+
Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel`
28572857
would result in using the ``xlrd`` engine in many cases, including new
2858-
Excel 2007+ (``.xlsx``) files.
2859-
If `openpyxl <https://openpyxl.readthedocs.io/en/stable/>`__ is installed,
2860-
many of these cases will now default to using the ``openpyxl`` engine.
2861-
See the :func:`read_excel` documentation for more details.
2858+
Excel 2007+ (``.xlsx``) files. pandas will now default to using the
2859+
`openpyxl <https://openpyxl.readthedocs.io/en/stable/>`__ engine.
28622860

2863-
Thus, it is strongly encouraged to install ``openpyxl`` to read Excel 2007+
2861+
It is strongly encouraged to install ``openpyxl`` to read Excel 2007+
28642862
(``.xlsx``) files.
28652863
**Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.**
28662864
This is no longer supported, switch to using ``openpyxl`` instead.

doc/source/whatsnew/v1.3.0.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,16 @@ including other versions of pandas.
88

99
{{ header }}
1010

11+
.. warning::
12+
13+
When reading new Excel 2007+ (``.xlsx``) files, the default argument
14+
``engine=None`` to :func:`~pandas.read_excel` will now result in using the
15+
`openpyxl <https://openpyxl.readthedocs.io/en/stable/>`_ engine in all cases
16+
when the option :attr:`io.excel.xlsx.reader` is set to ``"auto"``.
17+
Previously, some cases would use the
18+
`xlrd <https://xlrd.readthedocs.io/en/latest/>`_ engine instead. See
19+
:ref:`What's new 1.2.0 <whatsnew_120>` for background on this change.
20+
1121
.. ---------------------------------------------------------------------------
1222
1323
Enhancements
@@ -464,6 +474,7 @@ Other
464474
- Bug in :func:`pandas.testing.assert_series_equal`, :func:`pandas.testing.assert_frame_equal`, :func:`pandas.testing.assert_index_equal` and :func:`pandas.testing.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`)
465475
- Bug in :class:`Styler` where ``subset`` arg in methods raised an error for some valid multiindex slices (:issue:`33562`)
466476
- :class:`Styler` rendered HTML output minor alterations to support w3 good code standard (:issue:`39626`)
477+
- Bug in :class:`Styler` where rendered HTML was missing a column class identifier for certain header cells (:issue:`39716`)
467478
- Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`)
468479

469480

pandas/_libs/lib.pyx

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1059,11 +1059,12 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool:
10591059

10601060
cdef inline bint c_is_list_like(object obj, bint allow_sets) except -1:
10611061
return (
1062-
isinstance(obj, abc.Iterable)
1062+
# equiv: `isinstance(obj, abc.Iterable)`
1063+
hasattr(obj, "__iter__") and not isinstance(obj, type)
10631064
# we do not count strings/unicode/bytes as list-like
10641065
and not isinstance(obj, (str, bytes))
10651066
# exclude zero-dimensional numpy arrays, effectively scalars
1066-
and not (util.is_array(obj) and obj.ndim == 0)
1067+
and not cnp.PyArray_IsZeroDim(obj)
10671068
# exclude sets if allow_sets is False
10681069
and not (allow_sets is False and isinstance(obj, abc.Set))
10691070
)

pandas/_testing/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,8 +215,10 @@ def box_expected(expected, box_cls, transpose=True):
215215
if transpose:
216216
# for vector operations, we need a DataFrame to be a single-row,
217217
# not a single-column, in order to operate against non-DataFrame
218-
# vectors of the same length.
218+
# vectors of the same length. But convert to two rows to avoid
219+
# single-row special cases in datetime arithmetic
219220
expected = expected.T
221+
expected = pd.concat([expected] * 2, ignore_index=True)
220222
elif box_cls is PeriodArray:
221223
# the PeriodArray constructor is not as flexible as period_array
222224
expected = period_array(expected)

pandas/core/array_algos/transforms.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def shift(values: np.ndarray, periods: int, axis: int, fill_value) -> np.ndarray
1919
new_values = new_values.T
2020
axis = new_values.ndim - axis - 1
2121

22-
if np.prod(new_values.shape):
22+
if new_values.size:
2323
new_values = np.roll(new_values, ensure_platform_int(periods), axis=axis)
2424

2525
axis_indexer = [slice(None)] * values.ndim

pandas/core/arrays/numpy_.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
)
1919
from pandas.compat.numpy import function as nv
2020

21+
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
2122
from pandas.core.dtypes.dtypes import PandasDtype
2223
from pandas.core.dtypes.missing import isna
2324

@@ -97,6 +98,14 @@ def _from_sequence(
9798
dtype = dtype._dtype
9899

99100
result = np.asarray(scalars, dtype=dtype)
101+
if (
102+
result.ndim > 1
103+
and not hasattr(scalars, "dtype")
104+
and (dtype is None or dtype == object)
105+
):
106+
# e.g. list-of-tuples
107+
result = construct_1d_object_array_from_listlike(scalars)
108+
100109
if copy and result is scalars:
101110
result = result.copy()
102111
return cls(result)

pandas/core/computation/expressions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check):
8080
if op_str is not None:
8181

8282
# required min elements (otherwise we are adding overhead)
83-
if np.prod(a.shape) > _MIN_ELEMENTS:
83+
if a.size > _MIN_ELEMENTS:
8484
# check for dtype compatibility
8585
dtypes: Set[str] = set()
8686
for o in [a, b]:

pandas/core/dtypes/cast.py

Lines changed: 7 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -609,24 +609,10 @@ def maybe_promote(dtype: np.dtype, fill_value=np.nan):
609609
return np.dtype(object), fill_value
610610

611611
elif issubclass(dtype.type, np.timedelta64):
612-
if (
613-
is_integer(fill_value)
614-
or is_float(fill_value)
615-
or isinstance(fill_value, str)
616-
):
617-
# TODO: What about str that can be a timedelta?
618-
dtype = np.dtype(np.object_)
619-
else:
620-
try:
621-
fv = Timedelta(fill_value)
622-
except ValueError:
623-
dtype = np.dtype(np.object_)
624-
else:
625-
if fv is NaT:
626-
# NaT has no `to_timedelta64` method
627-
fill_value = np.timedelta64("NaT", "ns")
628-
else:
629-
fill_value = fv.to_timedelta64()
612+
inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
613+
if inferred == dtype:
614+
return dtype, fv
615+
return np.dtype(object), fill_value
630616

631617
elif is_float(fill_value):
632618
if issubclass(dtype.type, np.bool_):
@@ -782,11 +768,12 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj,
782768

783769
elif isinstance(val, (np.timedelta64, timedelta)):
784770
try:
785-
val = Timedelta(val).value
771+
val = Timedelta(val)
786772
except (OutOfBoundsTimedelta, OverflowError):
787773
dtype = np.dtype(object)
788774
else:
789775
dtype = np.dtype("m8[ns]")
776+
val = np.timedelta64(val.value, "ns")
790777

791778
elif is_bool(val):
792779
dtype = np.dtype(np.bool_)
@@ -1546,7 +1533,7 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]):
15461533
value = iNaT
15471534

15481535
# we have an array of datetime or timedeltas & nulls
1549-
elif np.prod(value.shape) or not is_dtype_equal(value.dtype, dtype):
1536+
elif value.size or not is_dtype_equal(value.dtype, dtype):
15501537
_disallow_mismatched_datetimelike(value, dtype)
15511538

15521539
try:

pandas/core/dtypes/missing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,7 @@ def array_equivalent(
439439

440440
# NaNs can occur in float and complex arrays.
441441
if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype):
442-
if not (np.prod(left.shape) and np.prod(right.shape)):
442+
if not (left.size and right.size):
443443
return True
444444
return ((left == right) | (isna(left) & isna(right))).all()
445445

pandas/core/frame.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4066,8 +4066,8 @@ def lookup(self, row_labels, col_labels) -> np.ndarray:
40664066
.. deprecated:: 1.2.0
40674067
DataFrame.lookup is deprecated,
40684068
use DataFrame.melt and DataFrame.loc instead.
4069-
For an example see :meth:`~pandas.DataFrame.lookup`
4070-
in the user guide.
4069+
For further details see
4070+
:ref:`Looking up values by index/column labels <indexing.lookup>`.
40714071
40724072
Parameters
40734073
----------

0 commit comments

Comments
 (0)