
Commit 3ffd35c

Merge remote-tracking branch 'upstream/master' into timedelta_rounding

2 parents: d86f26b + 0c4e611

262 files changed, with 20990 additions and 10608 deletions. Only a subset of the changed files is shown below.


.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 24 additions & 0 deletions
@@ -1,3 +1,27 @@
+Checklist for the pandas documentation sprint (ignore this if you are doing
+an unrelated PR):
+
+- [ ] PR title is "DOC: update the <your-function-or-method> docstring"
+- [ ] The validation script passes: `scripts/validate_docstrings.py <your-function-or-method>`
+- [ ] The PEP8 style check passes: `git diff upstream/master -u -- "*.py" | flake8 --diff`
+- [ ] The html version looks good: `python doc/make.py --single <your-function-or-method>`
+- [ ] It has been proofread on language by another sprint participant
+
+Please include the output of the validation script below between the "```" ticks:
+
+```
+# paste output of "scripts/validate_docstrings.py <your-function-or-method>" here
+# between the "```" (remove this comment, but keep the "```")
+
+```
+
+If the validation script still gives errors, but you think there is a good reason
+to deviate in this case (and there are certainly such cases), please state this
+explicitly.
+
+
+Checklist for other PRs (remove this part if you are doing a PR for the pandas documentation sprint):
+
 - [ ] closes #xxxx
 - [ ] tests added / passed
 - [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
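For orientation, the docstrings that `scripts/validate_docstrings.py` checks follow the numpydoc layout used across pandas. A minimal sketch of the shape the script expects (the `add` function is purely illustrative, not part of this commit):

```
def add(num1, num2):
    """
    Add two numbers.

    Parameters
    ----------
    num1 : int
        First number to add.
    num2 : int
        Second number to add.

    Returns
    -------
    int
        The sum of `num1` and `num2`.

    Examples
    --------
    >>> add(2, 2)
    4
    """
    return num1 + num2
```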

.gitignore

Lines changed: 4 additions & 2 deletions
@@ -61,6 +61,7 @@ dist
 .coverage
 coverage.xml
 coverage_html_report
+*.pytest_cache
 
 # OS generated files #
 ######################
@@ -88,8 +89,8 @@ scikits
 *.c
 *.cpp
 
-# Performance Testing #
-#######################
+# Unit / Performance Testing #
+##############################
 asv_bench/env/
 asv_bench/html/
 asv_bench/results/
@@ -108,3 +109,4 @@ doc/tmp.sv
 doc/source/styled.xlsx
 doc/source/templates/
 env/
+doc/source/savefig/

asv_bench/benchmarks/groupby.py

Lines changed: 58 additions & 79 deletions
@@ -14,7 +14,10 @@
 method_blacklist = {
     'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
                'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
-               'var', 'mad', 'describe', 'std'}
+               'var', 'mad', 'describe', 'std'},
+    'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
+                 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
+                 'std'}
 }
 
 
@@ -90,45 +93,6 @@ def time_series_groups(self, data, key):
         self.ser.groupby(self.ser).groups
 
 
-class FirstLast(object):
-
-    goal_time = 0.2
-
-    param_names = ['dtype']
-    params = ['float32', 'float64', 'datetime', 'object']
-
-    def setup(self, dtype):
-        N = 10**5
-        # with datetimes (GH7555)
-        if dtype == 'datetime':
-            self.df = DataFrame({'values': date_range('1/1/2011',
-                                                      periods=N,
-                                                      freq='s'),
-                                 'key': range(N)})
-        elif dtype == 'object':
-            self.df = DataFrame({'values': ['foo'] * N,
-                                 'key': range(N)})
-        else:
-            labels = np.arange(N / 10).repeat(10)
-            data = Series(np.random.randn(len(labels)), dtype=dtype)
-            data[::3] = np.nan
-            data[1::3] = np.nan
-            labels = labels.take(np.random.permutation(len(labels)))
-            self.df = DataFrame({'values': data, 'key': labels})
-
-    def time_groupby_first(self, dtype):
-        self.df.groupby('key').first()
-
-    def time_groupby_last(self, dtype):
-        self.df.groupby('key').last()
-
-    def time_groupby_nth_all(self, dtype):
-        self.df.groupby('key').nth(0, dropna='all')
-
-    def time_groupby_nth_none(self, dtype):
-        self.df.groupby('key').nth(0)
-
-
 class GroupManyLabels(object):
 
     goal_time = 0.2
@@ -149,39 +113,40 @@ class Nth(object):
 
     goal_time = 0.2
 
-    def setup_cache(self):
-        df = DataFrame(np.random.randint(1, 100, (10000, 2)))
-        df.iloc[1, 1] = np.nan
-        return df
-
-    def time_frame_nth_any(self, df):
-        df.groupby(0).nth(0, dropna='any')
-
-    def time_frame_nth(self, df):
-        df.groupby(0).nth(0)
-
+    param_names = ['dtype']
+    params = ['float32', 'float64', 'datetime', 'object']
 
-    def time_series_nth_any(self, df):
-        df[1].groupby(df[0]).nth(0, dropna='any')
+    def setup(self, dtype):
+        N = 10**5
+        # with datetimes (GH7555)
+        if dtype == 'datetime':
+            values = date_range('1/1/2011', periods=N, freq='s')
+        elif dtype == 'object':
+            values = ['foo'] * N
+        else:
+            values = np.arange(N).astype(dtype)
 
-    def time_series_nth(self, df):
-        df[1].groupby(df[0]).nth(0)
+        key = np.arange(N)
+        self.df = DataFrame({'key': key, 'values': values})
+        self.df.iloc[1, 1] = np.nan  # insert missing data
 
+    def time_frame_nth_any(self, dtype):
+        self.df.groupby('key').nth(0, dropna='any')
 
-class NthObject(object):
+    def time_groupby_nth_all(self, dtype):
+        self.df.groupby('key').nth(0, dropna='all')
 
-    goal_time = 0.2
+    def time_frame_nth(self, dtype):
+        self.df.groupby('key').nth(0)
 
-    def setup_cache(self):
-        df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
-        df['obj'] = ['a'] * 5000 + ['b'] * 5000
-        return df
+    def time_series_nth_any(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0, dropna='any')
 
-    def time_nth(self, df):
-        df.groupby('g').nth(5)
+    def time_groupby_nth_all(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0, dropna='all')
 
-    def time_nth_last(self, df):
-        df.groupby('g').last()
+    def time_series_nth(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0)
 
 
 class DateAttributes(object):
@@ -243,7 +208,7 @@ def time_multi_count(self, df):
         df.groupby(['key1', 'key2']).count()
 
 
-class CountInt(object):
+class CountMultiInt(object):
 
     goal_time = 0.2
 
@@ -255,18 +220,18 @@ def setup_cache(self):
                         'ints2': np.random.randint(0, 1000, size=n)})
         return df
 
-    def time_int_count(self, df):
+    def time_multi_int_count(self, df):
        df.groupby(['key1', 'key2']).count()
 
-    def time_int_nunique(self, df):
+    def time_multi_int_nunique(self, df):
         df.groupby(['key1', 'key2']).nunique()
 
 
 class AggFunctions(object):
 
     goal_time = 0.2
 
-    def setup_cache(self):
+    def setup_cache():
         N = 10**5
         fac1 = np.array(['A', 'B', 'C'], dtype='O')
         fac2 = np.array(['one', 'two'], dtype='O')
@@ -361,9 +326,6 @@ def setup(self):
     def time_multi_size(self):
         self.df.groupby(['key1', 'key2']).size()
 
-    def time_dt_size(self):
-        self.df.groupby(['dates']).size()
-
     def time_dt_timegrouper_size(self):
         with warnings.catch_warnings(record=True):
             self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
@@ -376,15 +338,16 @@ class GroupByMethods(object):
 
     goal_time = 0.2
 
-    param_names = ['dtype', 'method']
-    params = [['int', 'float', 'object'],
+    param_names = ['dtype', 'method', 'application']
+    params = [['int', 'float', 'object', 'datetime'],
              ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
               'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
               'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
               'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
-              'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
+              'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
+             ['direct', 'transformation']]
 
-    def setup(self, dtype, method):
+    def setup(self, dtype, method, application):
         if method in method_blacklist.get(dtype, {}):
             raise NotImplementedError  # skip benchmark
         ngroups = 1000
@@ -398,12 +361,28 @@ def setup(self, dtype, method):
                                np.random.random(ngroups) * 10.0])
         elif dtype == 'object':
             key = ['foo'] * size
+        elif dtype == 'datetime':
+            key = date_range('1/1/2011', periods=size, freq='s')
 
         df = DataFrame({'values': values, 'key': key})
-        self.df_groupby_method = getattr(df.groupby('key')['values'], method)
 
-    def time_method(self, dtype, method):
-        self.df_groupby_method()
+        if application == 'transform':
+            if method == 'describe':
+                raise NotImplementedError
+
+            self.as_group_method = lambda: df.groupby(
+                'key')['values'].transform(method)
+            self.as_field_method = lambda: df.groupby(
+                'values')['key'].transform(method)
+        else:
+            self.as_group_method = getattr(df.groupby('key')['values'], method)
+            self.as_field_method = getattr(df.groupby('values')['key'], method)
+
+    def time_dtype_as_group(self, dtype, method, application):
+        self.as_group_method()
+
+    def time_dtype_as_field(self, dtype, method, application):
+        self.as_field_method()
 
 
 class Float32(object):
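For readers skimming the benchmark changes, a short sketch of the `groupby(...).nth` semantics that the consolidated `Nth` class now times across dtypes (toy data rather than benchmark-scale; assumes pandas imported as `pd`):

```
import numpy as np
import pandas as pd

df = pd.DataFrame({'key': [1, 1, 2, 2],
                   'values': [np.nan, 20.0, 30.0, 40.0]})

# nth(0) picks the first row of each group positionally, NaNs included:
# group 1 -> NaN, group 2 -> 30.0
df.groupby('key').nth(0)

# dropna='any' drops rows with missing values before selecting,
# so group 1 yields 20.0 instead of NaN
df.groupby('key').nth(0, dropna='any')
```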

asv_bench/benchmarks/io/csv.py

Lines changed: 0 additions & 32 deletions
@@ -118,38 +118,6 @@ def time_read_uint64_na_values(self):
                  na_values=self.na_values)
 
 
-class S3(object):
-    # Make sure that we can read part of a file from S3 without
-    # needing to download the entire thing. Use the timeit.default_timer
-    # to measure wall time instead of CPU time -- we want to see
-    # how long it takes to download the data.
-    timer = timeit.default_timer
-    params = ([None, "gzip", "bz2"], ["python", "c"])
-    param_names = ["compression", "engine"]
-
-    def setup(self, compression, engine):
-        if compression == "bz2" and engine == "c" and PY2:
-            # The Python 2 C parser can't read bz2 from open files.
-            raise NotImplementedError
-        try:
-            import s3fs  # noqa
-        except ImportError:
-            # Skip these benchmarks if `boto` is not installed.
-            raise NotImplementedError
-
-        ext = ""
-        if compression == "gzip":
-            ext = ".gz"
-        elif compression == "bz2":
-            ext = ".bz2"
-        self.big_fname = "s3://pandas-test/large_random.csv" + ext
-
-    def time_read_csv_10_rows(self, compression, engine):
-        # Read a small number of rows from a huge (100,000 x 50) table.
-        read_csv(self.big_fname, nrows=10, compression=compression,
-                 engine=engine)
-
-
 class ReadCSVThousands(BaseIO):
 
     goal_time = 0.2
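The removed `S3` class timed partial reads of a large remote CSV. The underlying call pattern is easy to sketch locally (the filename here is illustrative; reading `s3://` URLs additionally requires the optional `s3fs` dependency):

```
import pandas as pd

# With nrows, the parser stops after the requested rows instead of
# materializing the whole (potentially huge) file.
df = pd.read_csv("large_random.csv.gz", nrows=10, compression="gzip")
```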

asv_bench/benchmarks/timeseries.py

Lines changed: 1 addition & 2 deletions
@@ -75,8 +75,7 @@ def setup(self):
                                            freq='S'))
 
     def time_infer_dst(self):
-        with warnings.catch_warnings(record=True):
-            self.index.tz_localize('US/Eastern', infer_dst=True)
+        self.index.tz_localize('US/Eastern', ambiguous='infer')
 
 
 class ResetIndex(object):
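The benchmark now passes `ambiguous='infer'`, the keyword that replaced the deprecated `infer_dst=True` in `tz_localize`. A minimal sketch of the call on times made ambiguous by a DST fall-back (the sample timestamps are illustrative):

```
import pandas as pd

# 01:00 occurs twice when US/Eastern falls back on 2011-11-06;
# ambiguous='infer' uses the ordering to assign EDT then EST.
idx = pd.DatetimeIndex(['2011-11-06 00:00', '2011-11-06 01:00',
                        '2011-11-06 01:00', '2011-11-06 02:00'])
localized = idx.tz_localize('US/Eastern', ambiguous='infer')
```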

ci/build_docs.sh

Lines changed: 5 additions & 5 deletions
@@ -10,11 +10,11 @@ echo "inside $0"
 
 git show --pretty="format:" --name-only HEAD~5.. --first-parent | grep -P "rst|txt|doc"
 
-if [ "$?" != "0" ]; then
-    echo "Skipping doc build, none were modified"
-    # nope, skip docs build
-    exit 0
-fi
+# if [ "$?" != "0" ]; then
+#     echo "Skipping doc build, none were modified"
+#     # nope, skip docs build
+#     exit 0
+# fi
 
 
 if [ "$DOC" ]; then

ci/environment-dev.yaml

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ channels:
 dependencies:
   - Cython
   - NumPy
+  - flake8
   - moto
   - pytest>=3.1
   - python-dateutil>=2.5.0

ci/requirements-2.7_COMPAT.pip

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 html5lib==1.0b2
-beautifulsoup4==4.2.0
+beautifulsoup4==4.2.1
 openpyxl
 argparse

ci/requirements-3.6_DOC.run

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ sphinx
 nbconvert
 nbformat
 notebook
-matplotlib
+matplotlib=2.1*
 seaborn
 scipy
 lxml

ci/requirements-3.6_NUMPY_DEV.build.sh

Lines changed: 1 addition & 2 deletions
@@ -12,8 +12,7 @@ PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf
 pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy
 
 # install dateutil from master
-# pip install -U git+git://github.com/dateutil/dateutil.git
-pip install dateutil
+pip install -U git+git://github.com/dateutil/dateutil.git
 
 # cython via pip
 pip install cython

ci/requirements-optional-conda.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-beautifulsoup4
+beautifulsoup4>=4.2.1
 blosc
 bottleneck
 fastparquet

ci/requirements-optional-pip.txt

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # This file was autogenerated by scripts/convert_deps.py
 # Do not modify directly
-beautifulsoup4
+beautifulsoup4>=4.2.1
 blosc
 bottleneck
 fastparquet

ci/requirements_dev.txt

Lines changed: 2 additions & 1 deletion
@@ -2,9 +2,10 @@
 # Do not modify directly
 Cython
 NumPy
+flake8
 moto
 pytest>=3.1
 python-dateutil>=2.5.0
 pytz
 setuptools>=3.3
-sphinx
\ No newline at end of file
+sphinx

doc/cheatsheet/Pandas_Cheat_Sheet.pdf

160 KB (−71.3 KB)
Binary file not shown.
