Skip to content

Commit 8003d10

Browse files
committed
Merge branch 'master' into 24893-pivot_table
2 parents 45ffc77 + 2d65e38 commit 8003d10

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+1036
-454
lines changed

.pre-commit-config.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,3 @@ repos:
1515
hooks:
1616
- id: isort
1717
language: python_venv
18-
- repo: https://github.com/asottile/seed-isort-config
19-
rev: v1.9.2
20-
hooks:
21-
- id: seed-isort-config

asv_bench/asv.conf.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,13 @@
5050
"xlsxwriter": [],
5151
"xlrd": [],
5252
"xlwt": [],
53+
"odfpy": [],
5354
"pytest": [],
5455
// If using Windows with python 2.7 and want to build using the
5556
// mingw toolchain (rather than MSVC), uncomment the following line.
5657
// "libpython": [],
5758
},
58-
59+
"conda_channels": ["defaults", "conda-forge"],
5960
// Combinations of libraries/python versions can be excluded/included
6061
// from the set to test. Each entry is a dictionary containing additional
6162
// key-value pairs to include/exclude.

asv_bench/benchmarks/io/excel.py

Lines changed: 54 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,72 @@
11
from io import BytesIO
22

33
import numpy as np
4+
from odf.opendocument import OpenDocumentSpreadsheet
5+
from odf.table import Table, TableCell, TableRow
6+
from odf.text import P
47

58
from pandas import DataFrame, ExcelWriter, date_range, read_excel
69
import pandas.util.testing as tm
710

811

9-
class Excel:
12+
def _generate_dataframe():
13+
N = 2000
14+
C = 5
15+
df = DataFrame(
16+
np.random.randn(N, C),
17+
columns=["float{}".format(i) for i in range(C)],
18+
index=date_range("20000101", periods=N, freq="H"),
19+
)
20+
df["object"] = tm.makeStringIndex(N)
21+
return df
22+
23+
24+
class WriteExcel:
1025

1126
params = ["openpyxl", "xlsxwriter", "xlwt"]
1227
param_names = ["engine"]
1328

1429
def setup(self, engine):
15-
N = 2000
16-
C = 5
17-
self.df = DataFrame(
18-
np.random.randn(N, C),
19-
columns=["float{}".format(i) for i in range(C)],
20-
index=date_range("20000101", periods=N, freq="H"),
21-
)
22-
self.df["object"] = tm.makeStringIndex(N)
23-
self.bio_read = BytesIO()
24-
self.writer_read = ExcelWriter(self.bio_read, engine=engine)
25-
self.df.to_excel(self.writer_read, sheet_name="Sheet1")
26-
self.writer_read.save()
27-
self.bio_read.seek(0)
28-
29-
def time_read_excel(self, engine):
30-
read_excel(self.bio_read)
30+
self.df = _generate_dataframe()
3131

3232
def time_write_excel(self, engine):
33-
bio_write = BytesIO()
34-
bio_write.seek(0)
35-
writer_write = ExcelWriter(bio_write, engine=engine)
36-
self.df.to_excel(writer_write, sheet_name="Sheet1")
37-
writer_write.save()
33+
bio = BytesIO()
34+
bio.seek(0)
35+
writer = ExcelWriter(bio, engine=engine)
36+
self.df.to_excel(writer, sheet_name="Sheet1")
37+
writer.save()
38+
39+
40+
class ReadExcel:
41+
42+
params = ["xlrd", "openpyxl", "odf"]
43+
param_names = ["engine"]
44+
fname_excel = "spreadsheet.xlsx"
45+
fname_odf = "spreadsheet.ods"
46+
47+
def _create_odf(self):
48+
doc = OpenDocumentSpreadsheet()
49+
table = Table(name="Table1")
50+
for row in self.df.values:
51+
tr = TableRow()
52+
for val in row:
53+
tc = TableCell(valuetype="string")
54+
tc.addElement(P(text=val))
55+
tr.addElement(tc)
56+
table.addElement(tr)
57+
58+
doc.spreadsheet.addElement(table)
59+
doc.save(self.fname_odf)
60+
61+
def setup_cache(self):
62+
self.df = _generate_dataframe()
63+
64+
self.df.to_excel(self.fname_excel, sheet_name="Sheet1")
65+
self._create_odf()
66+
67+
def time_read_excel(self, engine):
68+
fname = self.fname_odf if engine == "odf" else self.fname_excel
69+
read_excel(fname, engine=engine)
3870

3971

4072
from ..pandas_vb_common import setup # noqa: F401 isort:skip

asv_bench/benchmarks/io/json.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,15 +118,15 @@ def setup(self, orient, frame):
118118
def time_to_json(self, orient, frame):
119119
getattr(self, frame).to_json(self.fname, orient=orient)
120120

121-
def mem_to_json(self, orient, frame):
121+
def peakmem_to_json(self, orient, frame):
122122
getattr(self, frame).to_json(self.fname, orient=orient)
123123

124124
def time_to_json_wide(self, orient, frame):
125125
base_df = getattr(self, frame).copy()
126126
df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1)
127127
df.to_json(self.fname, orient=orient)
128128

129-
def mem_to_json_wide(self, orient, frame):
129+
def peakmem_to_json_wide(self, orient, frame):
130130
base_df = getattr(self, frame).copy()
131131
df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1)
132132
df.to_json(self.fname, orient=orient)

asv_bench/benchmarks/package.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""
2+
Benchmarks for pandas at the package-level.
3+
"""
4+
import subprocess
5+
import sys
6+
7+
from pandas.compat import PY37
8+
9+
10+
class TimeImport:
11+
def time_import(self):
12+
if PY37:
13+
# on py37+ the "-X importtime" usage gives us a more precise
14+
# measurement of the import time we actually care about,
15+
# without the subprocess or interpreter overhead
16+
cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"]
17+
p = subprocess.run(cmd, stderr=subprocess.PIPE)
18+
19+
line = p.stderr.splitlines()[-1]
20+
field = line.split(b"|")[-2].strip()
21+
total = int(field) # microseconds
22+
return total
23+
24+
cmd = [sys.executable, "-c", "import pandas as pd"]
25+
subprocess.run(cmd, stderr=subprocess.PIPE)

asv_bench/benchmarks/rolling.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ def setup(self, constructor, window, dtype, method):
2121
def time_rolling(self, constructor, window, dtype, method):
2222
getattr(self.roll, method)()
2323

24+
def peakmem_rolling(self, constructor, window, dtype, method):
25+
getattr(self.roll, method)()
26+
2427

2528
class ExpandingMethods:
2629

ci/code_checks.sh

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -203,10 +203,14 @@ if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then
203203
import sys
204204
import pandas
205205
206-
blacklist = {'bs4', 'gcsfs', 'html5lib', 'ipython', 'jinja2' 'hypothesis',
206+
blacklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2', 'hypothesis',
207207
'lxml', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy',
208-
'tables', 'xlrd', 'xlsxwriter', 'xlwt'}
209-
mods = blacklist & set(m.split('.')[0] for m in sys.modules)
208+
'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'}
209+
210+
# GH#28227 for some of these we check for top-level modules, while others are
211+
# more specific (e.g. urllib.request)
212+
import_mods = set(m.split('.')[0] for m in sys.modules) | set(sys.modules)
213+
mods = blacklist & import_mods
210214
if mods:
211215
sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods)))
212216
sys.exit(len(mods))

doc/source/reference/plotting.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,14 @@ The following functions are contained in the `pandas.plotting` module.
1313
:toctree: api/
1414

1515
andrews_curves
16+
autocorrelation_plot
1617
bootstrap_plot
18+
boxplot
1719
deregister_matplotlib_converters
1820
lag_plot
1921
parallel_coordinates
22+
plot_params
2023
radviz
2124
register_matplotlib_converters
2225
scatter_matrix
26+
table

doc/source/user_guide/io.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5047,6 +5047,17 @@ Example of a callable using PostgreSQL `COPY clause
50475047
from io import StringIO
50485048

50495049
def psql_insert_copy(table, conn, keys, data_iter):
5050+
"""
5051+
Execute SQL statement inserting data
5052+
5053+
Parameters
5054+
----------
5055+
table : pandas.io.sql.SQLTable
5056+
conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
5057+
keys : list of str
5058+
Column names
5059+
data_iter : Iterable that iterates the values to be inserted
5060+
"""
50505061
# gets a DBAPI connection that can provide a cursor
50515062
dbapi_conn = conn.connection
50525063
with dbapi_conn.cursor() as cur:
@@ -5080,6 +5091,18 @@ table name and optionally a subset of columns to read.
50805091
50815092
pd.read_sql_table('data', engine)
50825093
5094+
.. note::
5095+
5096+
Note that pandas infers column dtypes from query outputs, and not by looking
5097+
up data types in the physical database schema. For example, assume ``userid``
5098+
is an integer column in a table. Then, intuitively, ``select userid ...`` will
5099+
return integer-valued series, while ``select cast(userid as text) ...`` will
5100+
return object-valued (str) series. Accordingly, if the query output is empty,
5101+
then all resulting columns will be returned as object-valued (since they are
5102+
most general). If you foresee that your query will sometimes generate an empty
5103+
result, you may want to explicitly typecast afterwards to ensure dtype
5104+
integrity.
5105+
50835106
You can also specify the name of the column as the ``DataFrame`` index,
50845107
and specify a subset of columns to be read.
50855108

doc/source/user_guide/options.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ determines how many rows are shown in the truncated repr.
163163
.. ipython:: python
164164
165165
pd.set_option('max_rows', 8)
166-
pd.set_option('max_rows', 4)
166+
pd.set_option('min_rows', 4)
167167
# below max_rows -> all rows shown
168168
df = pd.DataFrame(np.random.randn(7, 2))
169169
df

doc/source/whatsnew/v1.0.0.rst

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@ including other versions of pandas.
2020

2121
Enhancements
2222
~~~~~~~~~~~~
23-
24-
-
23+
- :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`)
2524
-
2625

2726
.. _whatsnew_1000.enhancements.other:
@@ -162,7 +161,7 @@ I/O
162161

163162
- :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`)
164163
- Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`)
165-
-
164+
- Improve infinity parsing. :meth:`read_csv` now interprets ``Infinity``, ``+Infinity``, ``-Infinity`` as floating point values (:issue:`10065`)
166165

167166
Plotting
168167
^^^^^^^^
@@ -172,12 +171,13 @@ Plotting
172171
- Bug in :meth:`DataFrame.plot` producing incorrect legend markers when plotting multiple series on the same axis (:issue:`18222`)
173172
- Bug in :meth:`DataFrame.plot` when ``kind='box'`` and data contains datetime or timedelta data. These types are now automatically dropped (:issue:`22799`)
174173
- Bug in :meth:`DataFrame.plot.line` and :meth:`DataFrame.plot.area` produce wrong xlim in x-axis (:issue:`27686`, :issue:`25160`, :issue:`24784`)
174+
- :func:`set_option` now validates that the plot backend provided to ``'plotting.backend'`` implements the backend when the option is set, rather than when a plot is created (:issue:`28163`)
175175

176176
Groupby/resample/rolling
177177
^^^^^^^^^^^^^^^^^^^^^^^^
178178

179179
-
180-
-
180+
- Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue:`28192`)
181181
- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
182182
- Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`)
183183

@@ -190,7 +190,7 @@ Reshaping
190190

191191
Sparse
192192
^^^^^^
193-
193+
- Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`)
194194
-
195195
-
196196

environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,5 @@ dependencies:
8080
- xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
8181
- xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
8282
- xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
83+
- odfpy # pandas.read_excel
8384
- pyreadstat # pandas.read_spss

pandas/_libs/index.pyx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from datetime import datetime, timedelta, date
2+
import warnings
23

34
import cython
45

pandas/_libs/index_class_helper.pxi.in

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,16 @@ cdef class {{name}}Engine(IndexEngine):
6060

6161
# A view is needed for some subclasses, such as PeriodEngine:
6262
values = self._get_index_values().view('{{dtype}}')
63-
indexer = values == val
63+
try:
64+
with warnings.catch_warnings():
65+
# e.g. if values is float64 and `val` is a str, suppress warning
66+
warnings.filterwarnings("ignore", category=FutureWarning)
67+
indexer = values == val
68+
except TypeError:
69+
# if the equality above returns a bool, cython will raise TypeError
70+
# when trying to cast it to ndarray
71+
raise KeyError(val)
72+
6473
found = np.where(indexer)[0]
6574
count = len(found)
6675

pandas/_libs/lib.pyx

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ def fast_unique_multiple(list arrays, sort: bool=True):
235235
if sort is None:
236236
try:
237237
uniques.sort()
238-
except Exception:
238+
except TypeError:
239239
# TODO: RuntimeWarning?
240240
pass
241241

@@ -264,7 +264,7 @@ def fast_unique_multiple_list(lists: list, sort: bool=True) -> list:
264264
if sort:
265265
try:
266266
uniques.sort()
267-
except Exception:
267+
except TypeError:
268268
pass
269269

270270
return uniques
@@ -304,7 +304,7 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True):
304304
if sort:
305305
try:
306306
uniques.sort()
307-
except Exception:
307+
except TypeError:
308308
pass
309309

310310
return uniques
@@ -1410,7 +1410,7 @@ def infer_datetimelike_array(arr: object) -> object:
14101410
try:
14111411
array_to_datetime(objs, errors='raise')
14121412
return 'datetime'
1413-
except:
1413+
except (ValueError, TypeError):
14141414
pass
14151415

14161416
# we are *not* going to infer from strings

0 commit comments

Comments
 (0)