Skip to content

Commit afc4d96

Browse files
author
MarcoGorelli
committed
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssummer/format_iso
2 parents 2e21e71 + 7b39329 commit afc4d96

File tree

121 files changed

+426
-1574
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

121 files changed

+426
-1574
lines changed

asv_bench/benchmarks/io/sql.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def setup(self, connection):
3838
},
3939
index=tm.makeStringIndex(N),
4040
)
41-
self.df.loc[1000:3000, "float_with_nan"] = np.nan
41+
self.df.iloc[1000:3000, 1] = np.nan
4242
self.df["date"] = self.df["datetime"].dt.date
4343
self.df["time"] = self.df["datetime"].dt.time
4444
self.df["datetime_string"] = self.df["datetime"].astype(str)
@@ -88,7 +88,7 @@ def setup(self, connection, dtype):
8888
},
8989
index=tm.makeStringIndex(N),
9090
)
91-
self.df.loc[1000:3000, "float_with_nan"] = np.nan
91+
self.df.iloc[1000:3000, 1] = np.nan
9292
self.df["date"] = self.df["datetime"].dt.date
9393
self.df["time"] = self.df["datetime"].dt.time
9494
self.df["datetime_string"] = self.df["datetime"].astype(str)
@@ -117,7 +117,7 @@ def setup(self):
117117
},
118118
index=tm.makeStringIndex(N),
119119
)
120-
self.df.loc[1000:3000, "float_with_nan"] = np.nan
120+
self.df.iloc[1000:3000, 1] = np.nan
121121
self.df["date"] = self.df["datetime"].dt.date
122122
self.df["time"] = self.df["datetime"].dt.time
123123
self.df["datetime_string"] = self.df["datetime"].astype(str)
@@ -164,7 +164,7 @@ def setup(self, dtype):
164164
},
165165
index=tm.makeStringIndex(N),
166166
)
167-
self.df.loc[1000:3000, "float_with_nan"] = np.nan
167+
self.df.iloc[1000:3000, 1] = np.nan
168168
self.df["date"] = self.df["datetime"].dt.date
169169
self.df["time"] = self.df["datetime"].dt.time
170170
self.df["datetime_string"] = self.df["datetime"].astype(str)

asv_bench/benchmarks/reshape.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def setup(self):
3636
self.df = DataFrame(data)
3737

3838
def time_reshape_pivot_time_series(self):
39-
self.df.pivot("date", "variable", "value")
39+
self.df.pivot(index="date", columns="variable", values="value")
4040

4141

4242
class SimpleReshape:

doc/scripts/eval_performance.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
from timeit import repeat as timeit
2+
3+
import numpy as np
4+
import seaborn as sns
5+
6+
from pandas import DataFrame
7+
8+
setup_common = """from pandas import DataFrame
9+
from numpy.random import randn
10+
df = DataFrame(randn(%d, 3), columns=list('abc'))
11+
%s"""
12+
13+
setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"
14+
15+
16+
def bench_with(n, times=10, repeat=3, engine="numexpr"):
17+
return (
18+
np.array(
19+
timeit(
20+
"df.eval(s, engine=%r)" % engine,
21+
setup=setup_common % (n, setup_with),
22+
repeat=repeat,
23+
number=times,
24+
)
25+
)
26+
/ times
27+
)
28+
29+
30+
setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"
31+
32+
33+
def bench_subset(n, times=20, repeat=3, engine="numexpr"):
34+
return (
35+
np.array(
36+
timeit(
37+
"df.query(s, engine=%r)" % engine,
38+
setup=setup_common % (n, setup_subset),
39+
repeat=repeat,
40+
number=times,
41+
)
42+
)
43+
/ times
44+
)
45+
46+
47+
def bench(mn=3, mx=7, num=100, engines=("python", "numexpr"), verbose=False):
48+
r = np.logspace(mn, mx, num=num).round().astype(int)
49+
50+
ev = DataFrame(np.empty((num, len(engines))), columns=engines)
51+
qu = ev.copy(deep=True)
52+
53+
ev["size"] = qu["size"] = r
54+
55+
for engine in engines:
56+
for i, n in enumerate(r):
57+
if verbose & (i % 10 == 0):
58+
print("engine: %r, i == %d" % (engine, i))
59+
ev_times = bench_with(n, times=1, repeat=1, engine=engine)
60+
ev.loc[i, engine] = np.mean(ev_times)
61+
qu_times = bench_subset(n, times=1, repeat=1, engine=engine)
62+
qu.loc[i, engine] = np.mean(qu_times)
63+
64+
return ev, qu
65+
66+
67+
def plot_perf(df, engines, title, filename=None):
68+
from matplotlib.pyplot import figure
69+
70+
sns.set()
71+
sns.set_palette("Set2")
72+
73+
fig = figure(figsize=(4, 3), dpi=120)
74+
ax = fig.add_subplot(111)
75+
76+
for engine in engines:
77+
ax.loglog(df["size"], df[engine], label=engine, lw=2)
78+
79+
ax.set_xlabel("Number of Rows")
80+
ax.set_ylabel("Time (s)")
81+
ax.set_title(title)
82+
ax.legend(loc="best")
83+
ax.tick_params(top=False, right=False)
84+
85+
fig.tight_layout()
86+
87+
if filename is not None:
88+
fig.savefig(filename)
89+
90+
91+
if __name__ == "__main__":
92+
import os
93+
94+
pandas_dir = os.path.dirname(
95+
os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
96+
)
97+
static_path = os.path.join(pandas_dir, "doc", "source", "_static")
98+
99+
join = lambda p: os.path.join(static_path, p)
100+
101+
fn = join("eval-query-perf-data.h5")
102+
103+
engines = "python", "numexpr"
104+
105+
ev, qu = bench(verbose=True) # only this one
106+
107+
plot_perf(ev, engines, "DataFrame.eval()", filename=join("eval-perf.png"))
108+
plot_perf(qu, engines, "DataFrame.query()", filename=join("query-perf.png"))
-24.7 KB
Binary file not shown.

doc/source/_static/eval-perf.png

10.8 KB
Loading
-21.2 KB
Binary file not shown.

doc/source/_static/query-perf.png

8.79 KB
Loading

doc/source/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@
236236
if ".dev" in version:
237237
switcher_version = "dev"
238238
elif "rc" in version:
239-
switcher_version = version.split("rc")[0] + " (rc)"
239+
switcher_version = version.split("rc", maxsplit=1)[0] + " (rc)"
240240

241241
html_theme_options = {
242242
"external_links": [],

doc/source/getting_started/intro_tutorials/09_timeseries.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ I want to add a new column to the ``DataFrame`` containing only the month of the
144144
145145
By using ``Timestamp`` objects for dates, a lot of time-related
146146
properties are provided by pandas. For example the ``month``, but also
147-
``year``, ``weekofyear``, ``quarter``,… All of these properties are
147+
``year``, ``quarter``,… All of these properties are
148148
accessible by the ``dt`` accessor.
149149

150150
.. raw:: html

doc/source/reference/indexing.rst

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -343,8 +343,6 @@ Time/date components
343343
DatetimeIndex.timetz
344344
DatetimeIndex.dayofyear
345345
DatetimeIndex.day_of_year
346-
DatetimeIndex.weekofyear
347-
DatetimeIndex.week
348346
DatetimeIndex.dayofweek
349347
DatetimeIndex.day_of_week
350348
DatetimeIndex.weekday

doc/source/reference/series.rst

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,8 +311,6 @@ Datetime properties
311311
Series.dt.second
312312
Series.dt.microsecond
313313
Series.dt.nanosecond
314-
Series.dt.week
315-
Series.dt.weekofyear
316314
Series.dt.dayofweek
317315
Series.dt.day_of_week
318316
Series.dt.weekday

doc/source/user_guide/categorical.rst

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -353,11 +353,6 @@ Renaming categories is done by using the
353353

354354
In contrast to R's ``factor``, categorical data can have categories of other types than string.
355355

356-
.. note::
357-
358-
Be aware that assigning new categories is an inplace operation, while most other operations
359-
under ``Series.cat`` per default return a new ``Series`` of dtype ``category``.
360-
361356
Categories must be unique or a ``ValueError`` is raised:
362357

363358
.. ipython:: python
@@ -952,7 +947,6 @@ categorical (categories and ordering). So if you read back the CSV file you have
952947
relevant columns back to ``category`` and assign the right categories and categories ordering.
953948

954949
.. ipython:: python
955-
:okwarning:
956950
957951
import io
958952
@@ -969,8 +963,8 @@ relevant columns back to ``category`` and assign the right categories and catego
969963
df2["cats"]
970964
# Redo the category
971965
df2["cats"] = df2["cats"].astype("category")
972-
df2["cats"].cat.set_categories(
973-
["very bad", "bad", "medium", "good", "very good"], inplace=True
966+
df2["cats"] = df2["cats"].cat.set_categories(
967+
["very bad", "bad", "medium", "good", "very good"]
974968
)
975969
df2.dtypes
976970
df2["cats"]
@@ -1162,16 +1156,12 @@ Constructing a ``Series`` from a ``Categorical`` will not copy the input
11621156
change the original ``Categorical``:
11631157

11641158
.. ipython:: python
1165-
:okwarning:
11661159
11671160
cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
11681161
s = pd.Series(cat, name="cat")
11691162
cat
11701163
s.iloc[0:2] = 10
11711164
cat
1172-
df = pd.DataFrame(s)
1173-
df["cat"].cat.categories = [1, 2, 3, 4, 5]
1174-
cat
11751165
11761166
Use ``copy=True`` to prevent such a behaviour or simply don't reuse ``Categoricals``:
11771167

doc/source/user_guide/enhancingperf.rst

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -690,21 +690,12 @@ The equivalent in standard Python would be
690690
df["a"] = 1
691691
df
692692
693-
The :class:`DataFrame.query` method has a ``inplace`` keyword which determines
694-
whether the query modifies the original frame.
695-
696-
.. ipython:: python
697-
698-
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
699-
df.query("a > 2")
700-
df.query("a > 2", inplace=True)
701-
df
702-
703693
Local variables
704694
~~~~~~~~~~~~~~~
705695

706696
You must *explicitly reference* any local variable that you want to use in an
707-
expression by placing the ``@`` character in front of the name. For example,
697+
expression by placing the ``@`` character in front of the name. This mechanism is
698+
the same for both :meth:`DataFrame.query` and :meth:`DataFrame.eval`. For example,
708699

709700
.. ipython:: python
710701
@@ -820,17 +811,12 @@ significant performance benefit. Here is a plot showing the running time of
820811
:func:`pandas.eval` as function of the size of the frame involved in the
821812
computation. The two lines are two different engines.
822813

814+
..
815+
The eval-perf.png figure below was generated with /doc/scripts/eval_performance.py
823816
824817
.. image:: ../_static/eval-perf.png
825818

826-
827-
.. note::
828-
829-
Operations with smallish objects (around 15k-20k rows) are faster using
830-
plain Python:
831-
832-
.. image:: ../_static/eval-perf-small.png
833-
819+
You will only see the performance benefits of using the ``numexpr`` engine with :func:`pandas.eval` if your frame has more than approximately 100,000 rows.
834820

835821
This plot was created using a :class:`DataFrame` with 3 columns each containing
836822
floating point values generated using ``numpy.random.randn()``.

doc/source/user_guide/indexing.rst

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,6 +1240,17 @@ If instead you don't want to or cannot name your index, you can use the name
12401240
renaming your columns to something less ambiguous.
12411241

12421242

1243+
The :meth:`DataFrame.query` method has an ``inplace`` keyword which determines
1244+
whether the query modifies the original frame.
1245+
1246+
.. ipython:: python
1247+
1248+
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
1249+
df.query("a > 2")
1250+
df.query("a > 2", inplace=True)
1251+
df
1252+
1253+
12431254
:class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax
12441255
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
12451256

@@ -1438,15 +1449,18 @@ Performance of :meth:`~pandas.DataFrame.query`
14381449
``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for
14391450
large frames.
14401451

1452+
..
1453+
The query-perf.png figure below was generated with /doc/scripts/eval_performance.py
1454+
14411455
.. image:: ../_static/query-perf.png
14421456

1443-
.. note::
14441457

1445-
You will only see the performance benefits of using the ``numexpr`` engine
1446-
with ``DataFrame.query()`` if your frame has more than approximately 200,000
1447-
rows.
14481458

1449-
.. image:: ../_static/query-perf-small.png
1459+
You will only see the performance benefits of using the ``numexpr`` engine
1460+
with ``DataFrame.query()`` if your frame has more than approximately 100,000
1461+
rows.
1462+
1463+
14501464

14511465
This plot was created using a ``DataFrame`` with 3 columns each containing
14521466
floating point values generated using ``numpy.random.randn()``.

doc/source/whatsnew/v0.15.0.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ For full docs, see the :ref:`categorical introduction <categorical>` and the
7070
:ref:`API documentation <api.arrays.categorical>`.
7171

7272
.. ipython:: python
73-
:okwarning:
7473
7574
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
7675
"raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
@@ -79,7 +78,7 @@ For full docs, see the :ref:`categorical introduction <categorical>` and the
7978
df["grade"]
8079
8180
# Rename the categories
82-
df["grade"].cat.categories = ["very good", "good", "very bad"]
81+
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
8382
8483
# Reorder the categories and simultaneously add the missing categories
8584
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad",

doc/source/whatsnew/v0.19.0.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -271,12 +271,12 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
271271
such as :func:`to_datetime`.
272272

273273
.. ipython:: python
274-
:okwarning:
275274
276275
df = pd.read_csv(StringIO(data), dtype="category")
277276
df.dtypes
278277
df["col3"]
279-
df["col3"].cat.categories = pd.to_numeric(df["col3"].cat.categories)
278+
new_categories = pd.to_numeric(df["col3"].cat.categories)
279+
df["col3"] = df["col3"].cat.rename_categories(new_categories)
280280
df["col3"]
281281
282282
.. _whatsnew_0190.enhancements.union_categoricals:

0 commit comments

Comments
 (0)