Skip to content

Commit 37915e9

Browse files
committed
Merge remote-tracking branch 'upstream/master' into ea-take
2 parents 67ba9dd + 60fe82c commit 37915e9

File tree

10 files changed

+211
-32
lines changed

10 files changed

+211
-32
lines changed

asv_bench/benchmarks/categoricals.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,3 +148,24 @@ def time_rank_int_cat(self):
148148

149149
def time_rank_int_cat_ordered(self):
150150
self.s_int_cat_ordered.rank()
151+
152+
153+
class Isin(object):
    """Benchmark ``Series.isin`` on a Series of categorical dtype.

    The benchmark is parametrized over the dtype of the values the
    categorical Series is built from: ``'object'`` (strings) or ``'int64'``.
    """

    goal_time = 0.2

    params = ['object', 'int64']
    param_names = ['dtype']

    def setup(self, dtype):
        """Build the categorical Series and the sample of values to look up.

        Parameters
        ----------
        dtype : str
            One of ``params``; ``'object'`` converts the integer values to
            strings before the Series is built.
        """
        # Fixed seed so every run is timed on the same data.
        np.random.seed(1234)
        n = 5 * 10**5
        sample_size = 100
        # n draws from the range [0, n // 10), so values repeat.
        arr = list(np.random.randint(0, n // 10, size=n))
        if dtype == 'object':
            arr = ['s%04d' % i for i in arr]
        self.sample = np.random.choice(arr, sample_size)
        self.series = pd.Series(arr).astype('category')

    def time_isin_categorical(self, dtype):
        """Time ``isin`` of the categorical Series against the sample."""
        self.series.isin(self.sample)

doc/source/indexing.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1773,7 +1773,7 @@ These both yield the same results, so which should you use? It is instructive to
17731773
of operations on these and why method 2 (``.loc``) is much preferred over method 1 (chained ``[]``).
17741774

17751775
``dfmi['one']`` selects the first level of the columns and returns a DataFrame that is singly-indexed.
1776-
Then another Python operation ``dfmi_with_one['second']`` selects the series indexed by ``'second'`` happens.
1776+
Then another Python operation ``dfmi_with_one['second']`` selects the series indexed by ``'second'``.
17771777
This is indicated by the variable ``dfmi_with_one`` because pandas sees these operations as separate events.
17781778
e.g. separate calls to ``__getitem__``, so it has to treat them as linear operations that happen one after another.
17791779

doc/source/whatsnew/v0.23.0.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -954,6 +954,7 @@ Performance Improvements
954954
- Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`)
955955
- Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`)
956956
- Improved performance of :func:`pandas.core.groupby.GroupBy.pct_change` (:issue:`19165`)
957+
- Improved performance of :func:`Series.isin` in the case of categorical dtypes (:issue:`20003`)
957958
- Fixed a performance regression for :func:`GroupBy.nth` and :func:`GroupBy.last` with some object columns (:issue:`19283`)
958959

959960
.. _whatsnew_0230.docs:
@@ -1156,6 +1157,10 @@ I/O
11561157
- Bug in :func:`read_csv` causing heap corruption on 32-bit, big-endian architectures (:issue:`20785`)
11571158
- Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`)
11581159
- Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`)
1160+
- Bug in :func:`DataFrame.to_latex()` where a ``NaN`` in a ``MultiIndex`` would cause an ``IndexError`` or incorrect output (:issue:`14249`)
1161+
- Bug in :func:`DataFrame.to_latex()` where a non-string index-level name would result in an ``AttributeError`` (:issue:`19981`)
1162+
- Bug in :func:`DataFrame.to_latex()` where the combination of an index name and the ``index_names=False`` option would result in incorrect output (:issue:`18326`)
1163+
- Bug in :func:`DataFrame.to_latex()` where a ``MultiIndex`` with an empty string as its name would result in incorrect output (:issue:`18669`)
11591164
- Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`)
11601165
- Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`)
11611166
- :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`)

pandas/core/algorithms.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,13 @@ def isin(comps, values):
408408
if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
409409
values = construct_1d_object_array_from_listlike(list(values))
410410

411+
if is_categorical_dtype(comps):
412+
# TODO(extension)
413+
# handle categoricals
414+
return comps._values.isin(values)
415+
416+
comps = com._values_from_object(comps)
417+
411418
comps, dtype, _ = _ensure_data(comps)
412419
values, _, _ = _ensure_data(values, dtype=dtype)
413420

pandas/core/arrays/categorical.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
from pandas.util._decorators import (
4040
Appender, cache_readonly, deprecate_kwarg, Substitution)
4141

42+
import pandas.core.algorithms as algorithms
43+
4244
from pandas.io.formats.terminal import get_terminal_size
4345
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
4446
from pandas.core.config import get_option
@@ -2216,6 +2218,60 @@ def _concat_same_type(self, to_concat):
22162218
def _formatting_values(self):
22172219
return self
22182220

2221+
def isin(self, values):
    """
    Check whether `values` are contained in Categorical.

    Return a boolean NumPy Array showing whether each element in
    the Categorical matches an element in the passed sequence of
    `values` exactly.

    Parameters
    ----------
    values : set or list-like
        The sequence of values to test. Passing in a single string will
        raise a ``TypeError``. Instead, turn a single string into a
        list of one element.

    Returns
    -------
    isin : numpy.ndarray (bool dtype)

    Raises
    ------
    TypeError
      * If `values` is not a set or list-like

    See Also
    --------
    pandas.Series.isin : equivalent method on Series

    Examples
    --------

    >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
    ...                     'hippo'])
    >>> s.isin(['cow', 'lama'])
    array([ True,  True,  True, False,  True, False])

    Passing a single string as ``s.isin('lama')`` will raise an error. Use
    a list of one element instead:

    >>> s.isin(['lama'])
    array([ True, False,  True, False,  True, False])
    """
    # Validate first so that a scalar (including a bare string) is rejected
    # before any other work is done.
    if not is_list_like(values):
        raise TypeError("only list-like objects are allowed to be passed"
                        " to isin(), you passed a [{values_type}]"
                        .format(values_type=type(values).__name__))

    # NOTE(review): imported at call time rather than at module level,
    # presumably to avoid a circular import with pandas.core.series --
    # confirm before moving it.
    from pandas.core.series import _sanitize_array

    values = _sanitize_array(values, None, None)
    null_mask = np.asarray(isna(values))

    # Translate the requested values into positions within the categories;
    # values that are not a category come back as a negative code.
    code_values = self.categories.get_indexer(values)

    # Drop the codes of unknown values, but keep the entries that came from
    # missing values so they can still be matched against ``self.codes``
    # (see the NaN case in the accompanying test).
    code_values = code_values[null_mask | (code_values >= 0)]
    return algorithms.isin(self.codes, code_values)
2273+
2274+
22192275
# The Series.cat accessor
22202276

22212277

pandas/core/indexes/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3516,7 +3516,7 @@ def isin(self, values, level=None):
35163516
"""
35173517
if level is not None:
35183518
self._validate_index_level(level)
3519-
return algos.isin(np.array(self), values)
3519+
return algos.isin(self, values)
35203520

35213521
def _can_reindex(self, indexer):
35223522
"""

pandas/core/series.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3570,7 +3570,7 @@ def isin(self, values):
35703570
5 False
35713571
Name: animal, dtype: bool
35723572
"""
3573-
result = algorithms.isin(com._values_from_object(self), values)
3573+
result = algorithms.isin(self, values)
35743574
return self._constructor(result, index=self.index).__finalize__(self)
35753575

35763576
def between(self, left, right, inclusive=True):

pandas/io/formats/latex.py

Lines changed: 26 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -64,35 +64,32 @@ def get_col_type(dtype):
6464

6565
# reestablish the MultiIndex that has been joined by _to_str_column
6666
if self.fmt.index and isinstance(self.frame.index, MultiIndex):
67+
out = self.frame.index.format(
68+
adjoin=False, sparsify=self.fmt.sparsify,
69+
names=self.fmt.has_index_names, na_rep=self.fmt.na_rep
70+
)
71+
72+
# index.format will sparsify repeated entries with empty strings
73+
# so pad these with some empty space
74+
def pad_empties(x):
75+
for pad in reversed(x):
76+
if pad:
77+
break
78+
return [x[0]] + [i if i else ' ' * len(pad) for i in x[1:]]
79+
out = (pad_empties(i) for i in out)
80+
81+
# Add empty spaces for each column level
6782
clevels = self.frame.columns.nlevels
68-
strcols.pop(0)
69-
name = any(self.frame.index.names)
70-
cname = any(self.frame.columns.names)
71-
lastcol = self.frame.index.nlevels - 1
72-
previous_lev3 = None
73-
for i, lev in enumerate(self.frame.index.levels):
74-
lev2 = lev.format()
75-
blank = ' ' * len(lev2[0])
76-
# display column names in last index-column
77-
if cname and i == lastcol:
78-
lev3 = [x if x else '{}' for x in self.frame.columns.names]
79-
else:
80-
lev3 = [blank] * clevels
81-
if name:
82-
lev3.append(lev.name)
83-
current_idx_val = None
84-
for level_idx in self.frame.index.labels[i]:
85-
if ((previous_lev3 is None or
86-
previous_lev3[len(lev3)].isspace()) and
87-
lev2[level_idx] == current_idx_val):
88-
# same index as above row and left index was the same
89-
lev3.append(blank)
90-
else:
91-
# different value than above or left index different
92-
lev3.append(lev2[level_idx])
93-
current_idx_val = lev2[level_idx]
94-
strcols.insert(i, lev3)
95-
previous_lev3 = lev3
83+
out = [[' ' * len(i[-1])] * clevels + i for i in out]
84+
85+
# Add the column names to the last index column
86+
cnames = self.frame.columns.names
87+
if any(cnames):
88+
new_names = [i if i else '{}' for i in cnames]
89+
out[self.frame.index.nlevels - 1][:clevels] = new_names
90+
91+
# Get rid of old multiindex column and add new ones
92+
strcols = out + strcols[1:]
9693

9794
column_format = self.column_format
9895
if column_format is None:
@@ -118,7 +115,7 @@ def get_col_type(dtype):
118115
ilevels = self.frame.index.nlevels
119116
clevels = self.frame.columns.nlevels
120117
nlevels = clevels
121-
if any(self.frame.index.names):
118+
if self.fmt.has_index_names and self.fmt.show_index_names:
122119
nlevels += 1
123120
strrows = list(zip(*strcols))
124121
self.clinebuf = []

pandas/tests/categorical/test_algos.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,25 @@ def test_factorized_sort_ordered():
4747

4848
tm.assert_numpy_array_equal(labels, expected_labels)
4949
tm.assert_categorical_equal(uniques, expected_uniques)
50+
51+
52+
def test_isin_cats():
    """Categorical.isin matches categories and missing values, not unknowns."""
    # GH 20003
    cat = pd.Categorical(["a", "b", np.nan])

    # A missing value in `values` must match the missing entry in `cat`.
    result = cat.isin(["a", np.nan])
    expected = np.array([True, False, True], dtype=bool)
    tm.assert_numpy_array_equal(result, expected)

    # "c" is not a category, so it must not match anything (in particular
    # not the missing entry).
    result = cat.isin(["a", "c"])
    expected = np.array([True, False, False], dtype=bool)
    tm.assert_numpy_array_equal(result, expected)
63+
64+
65+
@pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])])
def test_isin_empty(empty):
    """Categorical.isin against an empty container is False everywhere."""
    cat = pd.Categorical(["a", "b"])
    expected = np.array([False, False], dtype=bool)

    result = cat.isin(empty)
    tm.assert_numpy_array_equal(result, expected)

pandas/tests/io/formats/test_to_latex.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -621,3 +621,74 @@ def test_to_latex_multiindex_names(self, name0, name1, axes):
621621
\end{tabular}
622622
""" % tuple(list(col_names) + [idx_names_row])
623623
assert observed == expected
624+
625+
@pytest.mark.parametrize('one_row', [True, False])
626+
def test_to_latex_multiindex_nans(self, one_row):
627+
# GH 14249
628+
df = pd.DataFrame({'a': [None, 1], 'b': [2, 3], 'c': [4, 5]})
629+
if one_row:
630+
df = df.iloc[[0]]
631+
observed = df.set_index(['a', 'b']).to_latex()
632+
expected = r"""\begin{tabular}{llr}
633+
\toprule
634+
& & c \\
635+
a & b & \\
636+
\midrule
637+
NaN & 2 & 4 \\
638+
"""
639+
if not one_row:
640+
expected += r"""1.0 & 3 & 5 \\
641+
"""
642+
expected += r"""\bottomrule
643+
\end{tabular}
644+
"""
645+
assert observed == expected
646+
647+
def test_to_latex_non_string_index(self):
648+
# GH 19981
649+
observed = pd.DataFrame([[1, 2, 3]] * 2).set_index([0, 1]).to_latex()
650+
expected = r"""\begin{tabular}{llr}
651+
\toprule
652+
& & 2 \\
653+
0 & 1 & \\
654+
\midrule
655+
1 & 2 & 3 \\
656+
& 2 & 3 \\
657+
\bottomrule
658+
\end{tabular}
659+
"""
660+
assert observed == expected
661+
662+
def test_to_latex_midrule_location(self):
663+
# GH 18326
664+
df = pd.DataFrame({'a': [1, 2]})
665+
df.index.name = 'foo'
666+
observed = df.to_latex(index_names=False)
667+
expected = r"""\begin{tabular}{lr}
668+
\toprule
669+
{} & a \\
670+
\midrule
671+
0 & 1 \\
672+
1 & 2 \\
673+
\bottomrule
674+
\end{tabular}
675+
"""
676+
677+
assert observed == expected
678+
679+
def test_to_latex_multiindex_empty_name(self):
680+
# GH 18669
681+
mi = pd.MultiIndex.from_product([[1, 2]], names=[''])
682+
df = pd.DataFrame(-1, index=mi, columns=range(4))
683+
observed = df.to_latex()
684+
expected = r"""\begin{tabular}{lrrrr}
685+
\toprule
686+
& 0 & 1 & 2 & 3 \\
687+
{} & & & & \\
688+
\midrule
689+
1 & -1 & -1 & -1 & -1 \\
690+
2 & -1 & -1 & -1 & -1 \\
691+
\bottomrule
692+
\end{tabular}
693+
"""
694+
assert observed == expected

0 commit comments

Comments
 (0)