Skip to content

Commit 22417cf

Browse files
author
MarcoGorelli
committed
Merge remote-tracking branch 'upstream/main' into implementation-pdep-4
2 parents b3e32ac + ab6562a commit 22417cf

File tree

111 files changed

+413
-1020
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

111 files changed

+413
-1020
lines changed

asv_bench/benchmarks/io/sql.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def setup(self, connection):
3838
},
3939
index=tm.makeStringIndex(N),
4040
)
41-
self.df.loc[1000:3000, "float_with_nan"] = np.nan
41+
self.df.iloc[1000:3000, 1] = np.nan
4242
self.df["date"] = self.df["datetime"].dt.date
4343
self.df["time"] = self.df["datetime"].dt.time
4444
self.df["datetime_string"] = self.df["datetime"].astype(str)
@@ -88,7 +88,7 @@ def setup(self, connection, dtype):
8888
},
8989
index=tm.makeStringIndex(N),
9090
)
91-
self.df.loc[1000:3000, "float_with_nan"] = np.nan
91+
self.df.iloc[1000:3000, 1] = np.nan
9292
self.df["date"] = self.df["datetime"].dt.date
9393
self.df["time"] = self.df["datetime"].dt.time
9494
self.df["datetime_string"] = self.df["datetime"].astype(str)
@@ -117,7 +117,7 @@ def setup(self):
117117
},
118118
index=tm.makeStringIndex(N),
119119
)
120-
self.df.loc[1000:3000, "float_with_nan"] = np.nan
120+
self.df.iloc[1000:3000, 1] = np.nan
121121
self.df["date"] = self.df["datetime"].dt.date
122122
self.df["time"] = self.df["datetime"].dt.time
123123
self.df["datetime_string"] = self.df["datetime"].astype(str)
@@ -164,7 +164,7 @@ def setup(self, dtype):
164164
},
165165
index=tm.makeStringIndex(N),
166166
)
167-
self.df.loc[1000:3000, "float_with_nan"] = np.nan
167+
self.df.iloc[1000:3000, 1] = np.nan
168168
self.df["date"] = self.df["datetime"].dt.date
169169
self.df["time"] = self.df["datetime"].dt.time
170170
self.df["datetime_string"] = self.df["datetime"].astype(str)

asv_bench/benchmarks/reshape.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def setup(self):
3636
self.df = DataFrame(data)
3737

3838
def time_reshape_pivot_time_series(self):
39-
self.df.pivot("date", "variable", "value")
39+
self.df.pivot(index="date", columns="variable", values="value")
4040

4141

4242
class SimpleReshape:

doc/scripts/eval_performance.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
from timeit import repeat as timeit
2+
3+
import numpy as np
4+
import seaborn as sns
5+
6+
from pandas import DataFrame
7+
8+
setup_common = """from pandas import DataFrame
9+
from numpy.random import randn
10+
df = DataFrame(randn(%d, 3), columns=list('abc'))
11+
%s"""
12+
13+
setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"
14+
15+
16+
def bench_with(n, times=10, repeat=3, engine="numexpr"):
17+
return (
18+
np.array(
19+
timeit(
20+
"df.eval(s, engine=%r)" % engine,
21+
setup=setup_common % (n, setup_with),
22+
repeat=repeat,
23+
number=times,
24+
)
25+
)
26+
/ times
27+
)
28+
29+
30+
setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"
31+
32+
33+
def bench_subset(n, times=20, repeat=3, engine="numexpr"):
34+
return (
35+
np.array(
36+
timeit(
37+
"df.query(s, engine=%r)" % engine,
38+
setup=setup_common % (n, setup_subset),
39+
repeat=repeat,
40+
number=times,
41+
)
42+
)
43+
/ times
44+
)
45+
46+
47+
def bench(mn=3, mx=7, num=100, engines=("python", "numexpr"), verbose=False):
48+
r = np.logspace(mn, mx, num=num).round().astype(int)
49+
50+
ev = DataFrame(np.empty((num, len(engines))), columns=engines)
51+
qu = ev.copy(deep=True)
52+
53+
ev["size"] = qu["size"] = r
54+
55+
for engine in engines:
56+
for i, n in enumerate(r):
57+
if verbose & (i % 10 == 0):
58+
print("engine: %r, i == %d" % (engine, i))
59+
ev_times = bench_with(n, times=1, repeat=1, engine=engine)
60+
ev.loc[i, engine] = np.mean(ev_times)
61+
qu_times = bench_subset(n, times=1, repeat=1, engine=engine)
62+
qu.loc[i, engine] = np.mean(qu_times)
63+
64+
return ev, qu
65+
66+
67+
def plot_perf(df, engines, title, filename=None):
68+
from matplotlib.pyplot import figure
69+
70+
sns.set()
71+
sns.set_palette("Set2")
72+
73+
fig = figure(figsize=(4, 3), dpi=120)
74+
ax = fig.add_subplot(111)
75+
76+
for engine in engines:
77+
ax.loglog(df["size"], df[engine], label=engine, lw=2)
78+
79+
ax.set_xlabel("Number of Rows")
80+
ax.set_ylabel("Time (s)")
81+
ax.set_title(title)
82+
ax.legend(loc="best")
83+
ax.tick_params(top=False, right=False)
84+
85+
fig.tight_layout()
86+
87+
if filename is not None:
88+
fig.savefig(filename)
89+
90+
91+
if __name__ == "__main__":
92+
import os
93+
94+
pandas_dir = os.path.dirname(
95+
os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
96+
)
97+
static_path = os.path.join(pandas_dir, "doc", "source", "_static")
98+
99+
join = lambda p: os.path.join(static_path, p)
100+
101+
fn = join("eval-query-perf-data.h5")
102+
103+
engines = "python", "numexpr"
104+
105+
ev, qu = bench(verbose=True) # only this one
106+
107+
plot_perf(ev, engines, "DataFrame.eval()", filename=join("eval-perf.png"))
108+
plot_perf(qu, engines, "DataFrame.query()", filename=join("query-perf.png"))
-24.7 KB
Binary file not shown.

doc/source/_static/eval-perf.png

10.8 KB
Loading
-21.2 KB
Binary file not shown.

doc/source/_static/query-perf.png

8.79 KB
Loading

doc/source/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@
236236
if ".dev" in version:
237237
switcher_version = "dev"
238238
elif "rc" in version:
239-
switcher_version = version.split("rc")[0] + " (rc)"
239+
switcher_version = version.split("rc", maxsplit=1)[0] + " (rc)"
240240

241241
html_theme_options = {
242242
"external_links": [],

doc/source/development/contributing_environment.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ To test out code changes, you'll need to build pandas from source, which
1010
requires a C/C++ compiler and Python environment. If you're making documentation
1111
changes, you can skip to :ref:`contributing to the documentation <contributing_documentation>` but if you skip
1212
creating the development environment you won't be able to build the documentation
13-
locally before pushing your changes.
13+
locally before pushing your changes. It's recommended to also install the :ref:`pre-commit hooks <contributing.pre-commit>`.
1414

1515
.. contents:: Table of contents:
1616
:local:

doc/source/user_guide/categorical.rst

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -353,11 +353,6 @@ Renaming categories is done by using the
353353

354354
In contrast to R's ``factor``, categorical data can have categories of other types than string.
355355

356-
.. note::
357-
358-
Be aware that assigning new categories is an inplace operation, while most other operations
359-
under ``Series.cat`` per default return a new ``Series`` of dtype ``category``.
360-
361356
Categories must be unique or a ``ValueError`` is raised:
362357

363358
.. ipython:: python
@@ -952,7 +947,6 @@ categorical (categories and ordering). So if you read back the CSV file you have
952947
relevant columns back to ``category`` and assign the right categories and categories ordering.
953948

954949
.. ipython:: python
955-
:okwarning:
956950
957951
import io
958952
@@ -969,8 +963,8 @@ relevant columns back to ``category`` and assign the right categories and catego
969963
df2["cats"]
970964
# Redo the category
971965
df2["cats"] = df2["cats"].astype("category")
972-
df2["cats"].cat.set_categories(
973-
["very bad", "bad", "medium", "good", "very good"], inplace=True
966+
df2["cats"] = df2["cats"].cat.set_categories(
967+
["very bad", "bad", "medium", "good", "very good"]
974968
)
975969
df2.dtypes
976970
df2["cats"]
@@ -1162,16 +1156,12 @@ Constructing a ``Series`` from a ``Categorical`` will not copy the input
11621156
change the original ``Categorical``:
11631157

11641158
.. ipython:: python
1165-
:okwarning:
11661159
11671160
cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
11681161
s = pd.Series(cat, name="cat")
11691162
cat
11701163
s.iloc[0:2] = 10
11711164
cat
1172-
df = pd.DataFrame(s)
1173-
df["cat"].cat.categories = [1, 2, 3, 4, 5]
1174-
cat
11751165
11761166
Use ``copy=True`` to prevent such a behaviour or simply don't reuse ``Categoricals``:
11771167

doc/source/user_guide/enhancingperf.rst

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -690,21 +690,12 @@ The equivalent in standard Python would be
690690
df["a"] = 1
691691
df
692692
693-
The :class:`DataFrame.query` method has a ``inplace`` keyword which determines
694-
whether the query modifies the original frame.
695-
696-
.. ipython:: python
697-
698-
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
699-
df.query("a > 2")
700-
df.query("a > 2", inplace=True)
701-
df
702-
703693
Local variables
704694
~~~~~~~~~~~~~~~
705695

706696
You must *explicitly reference* any local variable that you want to use in an
707-
expression by placing the ``@`` character in front of the name. For example,
697+
expression by placing the ``@`` character in front of the name. This mechanism is
698+
the same for both :meth:`DataFrame.query` and :meth:`DataFrame.eval`. For example,
708699

709700
.. ipython:: python
710701
@@ -820,17 +811,12 @@ significant performance benefit. Here is a plot showing the running time of
820811
:func:`pandas.eval` as function of the size of the frame involved in the
821812
computation. The two lines are two different engines.
822813

814+
..
815+
The eval-perf.png figure below was generated with /doc/scripts/eval_performance.py
823816
824817
.. image:: ../_static/eval-perf.png
825818

826-
827-
.. note::
828-
829-
Operations with smallish objects (around 15k-20k rows) are faster using
830-
plain Python:
831-
832-
.. image:: ../_static/eval-perf-small.png
833-
819+
You will only see the performance benefits of using the ``numexpr`` engine with :func:`pandas.eval` if your frame has more than approximately 100,000 rows.
834820

835821
This plot was created using a :class:`DataFrame` with 3 columns each containing
836822
floating point values generated using ``numpy.random.randn()``.

doc/source/user_guide/indexing.rst

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,6 +1240,17 @@ If instead you don't want to or cannot name your index, you can use the name
12401240
renaming your columns to something less ambiguous.
12411241

12421242

1243+
The :meth:`DataFrame.query` method has an ``inplace`` keyword which determines
1244+
whether the query modifies the original frame.
1245+
1246+
.. ipython:: python
1247+
1248+
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
1249+
df.query("a > 2")
1250+
df.query("a > 2", inplace=True)
1251+
df
1252+
1253+
12431254
:class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax
12441255
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
12451256

@@ -1438,15 +1449,18 @@ Performance of :meth:`~pandas.DataFrame.query`
14381449
``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for
14391450
large frames.
14401451

1452+
..
1453+
The query-perf.png figure below was generated with /doc/scripts/eval_performance.py
1454+
14411455
.. image:: ../_static/query-perf.png
14421456

1443-
.. note::
14441457

1445-
You will only see the performance benefits of using the ``numexpr`` engine
1446-
with ``DataFrame.query()`` if your frame has more than approximately 200,000
1447-
rows.
14481458

1449-
.. image:: ../_static/query-perf-small.png
1459+
You will only see the performance benefits of using the ``numexpr`` engine
1460+
with ``DataFrame.query()`` if your frame has more than approximately 100,000
1461+
rows.
1462+
1463+
14501464

14511465
This plot was created using a ``DataFrame`` with 3 columns each containing
14521466
floating point values generated using ``numpy.random.randn()``.

doc/source/whatsnew/v0.15.0.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ For full docs, see the :ref:`categorical introduction <categorical>` and the
7070
:ref:`API documentation <api.arrays.categorical>`.
7171

7272
.. ipython:: python
73-
:okwarning:
7473
7574
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
7675
"raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
@@ -79,7 +78,7 @@ For full docs, see the :ref:`categorical introduction <categorical>` and the
7978
df["grade"]
8079
8180
# Rename the categories
82-
df["grade"].cat.categories = ["very good", "good", "very bad"]
81+
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
8382
8483
# Reorder the categories and simultaneously add the missing categories
8584
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad",

doc/source/whatsnew/v0.19.0.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -271,12 +271,12 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
271271
such as :func:`to_datetime`.
272272

273273
.. ipython:: python
274-
:okwarning:
275274
276275
df = pd.read_csv(StringIO(data), dtype="category")
277276
df.dtypes
278277
df["col3"]
279-
df["col3"].cat.categories = pd.to_numeric(df["col3"].cat.categories)
278+
new_categories = pd.to_numeric(df["col3"].cat.categories)
279+
df["col3"] = df["col3"].cat.rename_categories(new_categories)
280280
df["col3"]
281281
282282
.. _whatsnew_0190.enhancements.union_categoricals:

doc/source/whatsnew/v2.0.0.rst

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,13 @@ Removal of prior version deprecations/changes
229229
- Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`)
230230
- Remove keywords ``convert_float`` and ``mangle_dupe_cols`` from :func:`read_excel` (:issue:`41176`)
231231
- Disallow passing non-keyword arguments to :func:`read_excel` except ``io`` and ``sheet_name`` (:issue:`34418`)
232+
- Disallow passing non-keyword arguments to :meth:`DataFrame.set_index` except ``keys`` (:issue:`41495`)
233+
- Disallow passing non-keyword arguments to :meth:`Resampler.interpolate` except ``method`` (:issue:`41699`)
234+
- Disallow passing non-keyword arguments to :meth:`DataFrame.reset_index` and :meth:`Series.reset_index` except ``level`` (:issue:`41496`)
235+
- Disallow passing non-keyword arguments to :meth:`DataFrame.dropna` and :meth:`Series.dropna` (:issue:`41504`)
236+
- Disallow passing non-keyword arguments to :meth:`ExtensionArray.argsort` (:issue:`46134`)
237+
- Disallow passing non-keyword arguments to :meth:`Categorical.sort_values` (:issue:`47618`)
238+
- Disallow passing non-keyword arguments to :meth:`Index.drop_duplicates` and :meth:`Series.drop_duplicates` (:issue:`41485`)
232239
- Disallow passing non-keyword arguments to :meth:`DataFrame.drop_duplicates` except for ``subset`` (:issue:`41485`)
233240
- Disallow passing non-keyword arguments to :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` (:issue:`41506`)
234241
- Disallow passing non-keyword arguments to :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` except for ``method`` (:issue:`41510`)
@@ -241,6 +248,9 @@ Removal of prior version deprecations/changes
241248
- Disallow passing non-keyword arguments to :func:`read_json` except for ``path_or_buf`` (:issue:`27573`)
242249
- Disallow passing non-keyword arguments to :func:`read_sas` except for ``filepath_or_buffer`` (:issue:`47154`)
243250
- Disallow passing non-keyword arguments to :func:`read_stata` except for ``filepath_or_buffer`` (:issue:`48128`)
251+
- Disallow passing non-keyword arguments to :func:`read_csv` except ``filepath_or_buffer`` (:issue:`41485`)
252+
- Disallow passing non-keyword arguments to :func:`read_table` except ``filepath_or_buffer`` (:issue:`41485`)
253+
- Disallow passing non-keyword arguments to :func:`read_fwf` except ``filepath_or_buffer`` (:issue:`44710`)
244254
- Disallow passing non-keyword arguments to :func:`read_xml` except for ``path_or_buffer`` (:issue:`45133`)
245255
- Disallow passing non-keyword arguments to :meth:`Series.mask` and :meth:`DataFrame.mask` except ``cond`` and ``other`` (:issue:`41580`)
246256
- Disallow passing non-keyword arguments to :meth:`DataFrame.to_stata` except for ``path`` (:issue:`48128`)
@@ -275,8 +285,10 @@ Removal of prior version deprecations/changes
275285
- Removed :meth:`Series.str.__iter__` (:issue:`28277`)
276286
- Removed ``pandas.SparseArray`` in favor of :class:`arrays.SparseArray` (:issue:`30642`)
277287
- Removed ``pandas.SparseSeries`` and ``pandas.SparseDataFrame``, including pickle support. (:issue:`30642`)
288+
- Enforced disallowing passing an integer ``fill_value`` to :meth:`DataFrame.shift` and :meth:`Series.shift` with datetime64, timedelta64, or period dtypes (:issue:`32591`)
278289
- Enforced disallowing a string column label into ``times`` in :meth:`DataFrame.ewm` (:issue:`43265`)
279290
- Enforced disallowing a tuple of column labels into :meth:`.DataFrameGroupBy.__getitem__` (:issue:`30546`)
291+
- Enforced disallowing setting values with ``.loc`` using a positional slice. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`)
280292
- Removed setting Categorical._codes directly (:issue:`41429`)
281293
- Enforced :meth:`Rolling.count` with ``min_periods=None`` to default to the size of the window (:issue:`31302`)
282294
- Renamed ``fname`` to ``path`` in :meth:`DataFrame.to_parquet`, :meth:`DataFrame.to_stata` and :meth:`DataFrame.to_feather` (:issue:`30338`)
@@ -290,6 +302,8 @@ Removal of prior version deprecations/changes
290302
- Changed behavior of :class:`DataFrame` constructor when passed a ``dtype`` (other than int) that the data cannot be cast to; it now raises instead of silently ignoring the dtype (:issue:`41733`)
291303
- Changed the behavior of :class:`Series` constructor, it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`)
292304
- Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)
305+
- Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included. To include them, manually cast to ``bool`` dtype first (:issue:`46188`)
306+
-
293307

294308
.. ---------------------------------------------------------------------------
295309
.. _whatsnew_200.performance:

pandas/_libs/parsers.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ from pandas._libs.util cimport (
7474
UINT64_MAX,
7575
)
7676

77-
import pandas._libs.lib as lib
77+
from pandas._libs import lib
7878

7979
from pandas._libs.khash cimport (
8080
kh_destroy_float64,

0 commit comments

Comments
 (0)