Commit 5ac40ff

Merge remote-tracking branch 'upstream/master' into multi-index-join

2 parents f0ac24d + d78bd7a

29 files changed: +323 −178 lines

doc/source/conf.py

Lines changed: 3 additions & 3 deletions
@@ -99,7 +99,7 @@
 # JP: added from sphinxdocs
 autosummary_generate = False

-if any(re.match("\s*api\s*", l) for l in index_rst_lines):
+if any(re.match(r"\s*api\s*", l) for l in index_rst_lines):
     autosummary_generate = True

 # numpydoc

@@ -341,8 +341,8 @@
 # file, target name, title, author, documentclass [howto/manual]).
 latex_documents = [
     ('index', 'pandas.tex',
-     u'pandas: powerful Python data analysis toolkit',
-     u'Wes McKinney\n\& PyData Development Team', 'manual'),
+     'pandas: powerful Python data analysis toolkit',
+     r'Wes McKinney\n\& PyData Development Team', 'manual'),
 ]

 # The name of an image file (relative to this directory) to place at the top of

doc/source/cookbook.rst

Lines changed: 11 additions & 0 deletions
@@ -1226,6 +1226,17 @@ Computation
 Correlation
 ***********

+Often it's useful to obtain the lower (or upper) triangular form of a correlation matrix calculated from :func:`DataFrame.corr`. This can be achieved by passing a boolean mask to ``where`` as follows:
+
+.. ipython:: python
+
+   df = pd.DataFrame(np.random.random(size=(100, 5)))
+
+   corr_mat = df.corr()
+   mask = np.tril(np.ones_like(corr_mat, dtype=np.bool), k=-1)
+
+   corr_mat.where(mask)
+
 The `method` argument within `DataFrame.corr` can accept a callable in addition to the named correlation types. Here we compute the `distance correlation <https://en.wikipedia.org/wiki/Distance_correlation>`__ matrix for a `DataFrame` object.

 .. code-block:: python
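
A quick standalone sanity check of the recipe added above, runnable outside the Sphinx build (this sketch uses the builtin ``bool`` in place of ``np.bool``, which newer NumPy versions remove):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.random(size=(100, 5)))

corr_mat = df.corr()
# Ones strictly below the diagonal only: k=-1 excludes the diagonal itself.
mask = np.tril(np.ones_like(corr_mat, dtype=bool), k=-1)

# where() keeps entries where the mask is True and NaNs out the rest,
# leaving the lower triangle of the correlation matrix.
print(corr_mat.where(mask))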

doc/source/groupby.rst

Lines changed: 10 additions & 0 deletions
@@ -125,6 +125,16 @@ We could naturally group by either the ``A`` or ``B`` columns, or both:
 grouped = df.groupby('A')
 grouped = df.groupby(['A', 'B'])

+.. versionadded:: 0.24
+
+If we also have a MultiIndex on columns ``A`` and ``B``, we can group by all
+but the specified columns:
+
+.. ipython:: python
+
+   df2 = df.set_index(['A', 'B'])
+   grouped = df2.groupby(level=df2.index.names.difference(['B']))
+
 These will split the DataFrame on its index (rows). We could also split by the
 columns:
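
A minimal sketch of the new recipe with a concrete frame (the column names and values here are made up for illustration):

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': ['foo', 'bar'] * 3,
                   'B': ['one', 'two', 'three'] * 2,
                   'C': np.arange(6.0)})
df2 = df.set_index(['A', 'B'])

# df2.index.names is a FrozenList; .difference(['B']) drops 'B' from it,
# so the groupby runs over every remaining index level (here, just 'A').
grouped = df2.groupby(level=df2.index.names.difference(['B']))
print(grouped.sum())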

doc/source/whatsnew/v0.24.0.txt

Lines changed: 2 additions & 3 deletions
@@ -13,10 +13,9 @@ v0.24.0 (Month XX, 2018)
 New features
 ~~~~~~~~~~~~
 - :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`)
-
-
 - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
-
+- ``FrozenList`` has gained the ``.union()`` and ``.difference()`` methods. This functionality greatly simplifies groupby's that rely on explicitly excluding certain columns. See :ref:`Splitting an object into groups
+  <groupby.split>` for more information (:issue:`15475`, :issue:`15506`)
 - :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing
   the user to override the engine's default behavior to include or omit the
   dataframe's indexes from the resulting Parquet file. (:issue:`20768`)
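
Hedged usage sketches for three of the entries above; the file paths are placeholders, and the ``ExcelWriter`` call assumes ``report.xlsx`` already exists (``mode='a'`` appends to an existing workbook):

import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3]})
ser = pd.Series([10, 20, 30], name='y')

# merge() now accepts a named Series directly, without .to_frame().
merged = pd.merge(df, ser, left_index=True, right_index=True)

# Append a sheet to an existing workbook; requires the openpyxl engine.
with pd.ExcelWriter('report.xlsx', engine='openpyxl', mode='a') as writer:
    df.to_excel(writer, sheet_name='new_sheet')

# index=False omits the frame's index from the Parquet file,
# overriding the engine's default behavior.
df.to_parquet('report.parquet', index=False)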

pandas/_libs/algos.pyx

Lines changed: 3 additions & 1 deletion
@@ -77,6 +77,8 @@ class NegInfinity(object):
     __ge__ = lambda self, other: isinstance(other, NegInfinity)


+@cython.wraparound(False)
+@cython.boundscheck(False)
 cpdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr):
     """
     Efficiently find the unique first-differences of the given array.

@@ -793,7 +795,7 @@ arrmap_bool = arrmap["uint8_t"]

 @cython.boundscheck(False)
 @cython.wraparound(False)
-def is_monotonic(ndarray[algos_t] arr, bint timelike):
+def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
     """
     Returns
     -------
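
For readers skimming the diff, ``unique_deltas`` returns the sorted unique first differences of an int64 array; a rough pure-NumPy sketch of the result (the Cython version builds it via a hash table instead):

import numpy as np

def unique_deltas_py(arr):
    # Mirrors the result of unique_deltas, not its implementation.
    arr = np.asarray(arr, dtype=np.int64)
    return np.unique(np.diff(arr))

unique_deltas_py([1, 3, 5, 6, 8])  # -> array([1, 2])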

pandas/_libs/groupby.pyx

Lines changed: 1 addition & 1 deletion
@@ -353,7 +353,7 @@ def group_any_all(ndarray[uint8_t] out,
     The returned values will either be 0 or 1 (False or True, respectively).
     """
     cdef:
-        Py_ssize_t i, N=len(labels)
+        Py_ssize_t i, N = len(labels)
         int64_t lab
         uint8_t flag_val

pandas/_libs/groupby_helper.pxi.in

Lines changed: 0 additions & 20 deletions
@@ -667,11 +667,6 @@ def group_max(ndarray[groupby_t, ndim=2] out,
                     out[i, j] = maxx[i, j]


-group_max_float64 = group_max["float64_t"]
-group_max_float32 = group_max["float32_t"]
-group_max_int64 = group_max["int64_t"]
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_min(ndarray[groupby_t, ndim=2] out,

@@ -734,11 +729,6 @@ def group_min(ndarray[groupby_t, ndim=2] out,
                     out[i, j] = minx[i, j]


-group_min_float64 = group_min["float64_t"]
-group_min_float32 = group_min["float32_t"]
-group_min_int64 = group_min["int64_t"]
-
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_cummin(ndarray[groupby_t, ndim=2] out,

@@ -787,11 +777,6 @@ def group_cummin(ndarray[groupby_t, ndim=2] out,
                 out[i, j] = mval


-group_cummin_float64 = group_cummin["float64_t"]
-group_cummin_float32 = group_cummin["float32_t"]
-group_cummin_int64 = group_cummin["int64_t"]
-
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_cummax(ndarray[groupby_t, ndim=2] out,

@@ -837,8 +822,3 @@ def group_cummax(ndarray[groupby_t, ndim=2] out,
             if val > mval:
                 accum[lab, j] = mval = val
             out[i, j] = mval
-
-
-group_cummax_float64 = group_cummax["float64_t"]
-group_cummax_float32 = group_cummax["float32_t"]
-group_cummax_int64 = group_cummax["int64_t"]

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 13 additions & 13 deletions
@@ -86,12 +86,12 @@ cdef class {{name}}Vector:
         self.data.n = 0
         self.data.m = _INIT_VEC_CAP
         self.ao = np.empty(self.data.m, dtype={{idtype}})
-        self.data.data = <{{arg}}*> self.ao.data
+        self.data.data = <{{arg}}*>self.ao.data

     cdef resize(self):
         self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
         self.ao.resize(self.data.m, refcheck=False)
-        self.data.data = <{{arg}}*> self.ao.data
+        self.data.data = <{{arg}}*>self.ao.data

     def __dealloc__(self):
         if self.data is not NULL:

@@ -140,7 +140,7 @@ cdef class StringVector:
         self.external_view_exists = False
         self.data.n = 0
         self.data.m = _INIT_VEC_CAP
-        self.data.data = <char **> malloc(self.data.m * sizeof(char *))
+        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
         if not self.data.data:
             raise MemoryError()

@@ -153,7 +153,7 @@ cdef class StringVector:
         self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)

         orig_data = self.data.data
-        self.data.data = <char **> malloc(self.data.m * sizeof(char *))
+        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
         if not self.data.data:
             raise MemoryError()
         for i in range(m):

@@ -208,22 +208,22 @@ cdef class ObjectVector:
         self.n = 0
         self.m = _INIT_VEC_CAP
         self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
-        self.data = <PyObject**> self.ao.data
+        self.data = <PyObject**>self.ao.data

     def __len__(self):
         return self.n

-    cdef inline append(self, object o):
+    cdef inline append(self, object obj):
         if self.n == self.m:
             if self.external_view_exists:
                 raise ValueError("external reference but "
                                  "Vector.resize() needed")
             self.m = max(self.m * 2, _INIT_VEC_CAP)
             self.ao.resize(self.m, refcheck=False)
-            self.data = <PyObject**> self.ao.data
+            self.data = <PyObject**>self.ao.data

-        Py_INCREF(o)
-        self.data[self.n] = <PyObject*> o
+        Py_INCREF(obj)
+        self.data[self.n] = <PyObject*>obj
         self.n += 1

     def to_array(self):

@@ -768,7 +768,7 @@ cdef class StringHashTable(HashTable):
         use_na_value = na_value is not None

         # assign pointers and pre-filter out missing
-        vecs = <const char **> malloc(n * sizeof(char *))
+        vecs = <const char **>malloc(n * sizeof(char *))
         for i in range(n):
             val = values[i]

@@ -844,9 +844,9 @@ cdef class PyObjectHashTable(HashTable):

     def sizeof(self, deep=False):
         """ return the size of my table in bytes """
-        return self.table.n_buckets * (sizeof(PyObject *) + # keys
-                                       sizeof(Py_ssize_t) + # vals
-                                       sizeof(uint32_t)) # flags
+        return self.table.n_buckets * (sizeof(PyObject *) +  # keys
+                                       sizeof(Py_ssize_t) +  # vals
+                                       sizeof(uint32_t))     # flags

     cpdef get_item(self, object val):
         cdef khiter_t k
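
The casts touched above all sit inside the vectors' growth paths; a loose Python sketch of that growth policy (the real ``_INIT_VEC_CAP`` constant lives elsewhere in the hashtable sources — 128 here is an assumption):

_INIT_VEC_CAP = 128  # assumed value of the real constant

class VectorSketch:
    # Typed vectors grow capacity to max(4 * m, _INIT_VEC_CAP) when full
    # (ObjectVector doubles instead), giving amortized O(1) appends.
    def __init__(self):
        self.m = _INIT_VEC_CAP          # capacity
        self.n = 0                      # number of stored elements
        self.buf = [None] * self.m

    def append(self, obj):
        if self.n == self.m:            # buffer full: grow, then store
            self.m = max(self.m * 4, _INIT_VEC_CAP)
            self.buf.extend([None] * (self.m - len(self.buf)))
        self.buf[self.n] = obj
        self.n += 1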

pandas/_libs/hashtable_func_helper.pxi.in

Lines changed: 11 additions & 11 deletions
@@ -45,11 +45,11 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
        val = values[i]

        if not checknull(val) or not dropna:
-            k = kh_get_{{ttype}}(table, <PyObject*> val)
+            k = kh_get_{{ttype}}(table, <PyObject*>val)
            if k != table.n_buckets:
                table.vals[k] += 1
            else:
-                k = kh_put_{{ttype}}(table, <PyObject*> val, &ret)
+                k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
                table.vals[k] = 1
    {{else}}
    with nogil:

@@ -103,7 +103,7 @@ cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna):
    {{if dtype == 'object'}}
    for k in range(table.n_buckets):
        if kh_exist_{{ttype}}(table, k):
-            result_keys[i] = <{{dtype}}> table.keys[k]
+            result_keys[i] = <{{dtype}}>table.keys[k]
            result_counts[i] = table.vals[k]
            i += 1
    {{else}}

@@ -152,7 +152,7 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
    if keep == 'last':
        {{if dtype == 'object'}}
        for i from n > i >= 0:
-            kh_put_{{ttype}}(table, <PyObject*> values[i], &ret)
+            kh_put_{{ttype}}(table, <PyObject*>values[i], &ret)
            out[i] = ret == 0
        {{else}}
        with nogil:

@@ -163,7 +163,7 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
    elif keep == 'first':
        {{if dtype == 'object'}}
        for i in range(n):
-            kh_put_{{ttype}}(table, <PyObject*> values[i], &ret)
+            kh_put_{{ttype}}(table, <PyObject*>values[i], &ret)
            out[i] = ret == 0
        {{else}}
        with nogil:

@@ -175,13 +175,13 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
        {{if dtype == 'object'}}
        for i in range(n):
            value = values[i]
-            k = kh_get_{{ttype}}(table, <PyObject*> value)
+            k = kh_get_{{ttype}}(table, <PyObject*>value)
            if k != table.n_buckets:
                out[table.vals[k]] = 1
                out[i] = 1
            else:
-                k = kh_put_{{ttype}}(table, <PyObject*> value, &ret)
-                table.keys[k] = <PyObject*> value
+                k = kh_put_{{ttype}}(table, <PyObject*>value, &ret)
+                table.keys[k] = <PyObject*>value
                table.vals[k] = i
                out[i] = 0
        {{else}}

@@ -245,7 +245,7 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values):

    {{if dtype == 'object'}}
    for i in range(n):
-        kh_put_{{ttype}}(table, <PyObject*> values[i], &ret)
+        kh_put_{{ttype}}(table, <PyObject*>values[i], &ret)
    {{else}}
    with nogil:
        for i in range(n):

@@ -259,7 +259,7 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values):
    {{if dtype == 'object'}}
    for i in range(n):
        val = arr[i]
-        k = kh_get_{{ttype}}(table, <PyObject*> val)
+        k = kh_get_{{ttype}}(table, <PyObject*>val)
        result[i] = (k != table.n_buckets)
    {{else}}
    with nogil:

@@ -342,7 +342,7 @@ def mode_{{dtype}}({{ctype}}[:] values, bint dropna):
        else:
            continue

-        modes[j] = <object> table.keys[k]
+        modes[j] = <object>table.keys[k]
    {{endif}}

    kh_destroy_{{table_type}}(table)
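
The templated ``duplicated_{{dtype}}`` above is easier to follow without the khash plumbing; a plain-Python sketch of its ``keep`` modes (a set/dict stands in for the hash table):

def duplicated_py(values, keep='first'):
    n = len(values)
    out = [False] * n
    if keep in ('first', 'last'):
        # Scan toward keep's end; anything already seen is a duplicate.
        seen = set()
        order = range(n) if keep == 'first' else range(n - 1, -1, -1)
        for i in order:
            out[i] = values[i] in seen
            seen.add(values[i])
    else:
        # keep=False: flag every member of any duplicated group.
        first_index = {}
        for i, val in enumerate(values):
            if val in first_index:
                out[first_index[val]] = True
                out[i] = True
            else:
                first_index[val] = i
    return out

duplicated_py([1, 2, 1, 3, 2])              # [False, False, True, False, True]
duplicated_py([1, 2, 1, 3, 2], keep=False)  # [True, True, True, False, True]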

pandas/_libs/join.pyx

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-

-cimport cython
-from cython cimport Py_ssize_t
+import cython
+from cython import Py_ssize_t

 import numpy as np
 cimport numpy as cnp
