Skip to content

Commit 99c240c

Browse files
OXPHOS authored and OXPHOS committed
fix v2
1 parent db45f86 commit 99c240c

File tree

5 files changed, with 39 additions and 33 deletions.

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 8 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -347,9 +347,10 @@ cdef class {{name}}HashTable(HashTable):
347347
for i in range(n):
348348
val = values[i]
349349

350-
if check_null and {{null_condition}}:
351-
labels[i] = na_sentinel
352-
continue
350+
if dropna:
351+
if check_null and {{null_condition}}:
352+
labels[i] = na_sentinel
353+
continue
353354

354355
k = kh_get_{{dtype}}(self.table, val)
355356

@@ -830,13 +831,10 @@ cdef class PyObjectHashTable(HashTable):
830831
val = values[i]
831832
hash(val)
832833

833-
if check_null and val != val:
834-
labels[i] = na_sentinel
835-
continue
836-
837-
if dropna and val is None:
838-
labels[i] = na_sentinel
839-
continue
834+
if dropna:
835+
if (check_null and val != val) or val is None:
836+
labels[i] = na_sentinel
837+
continue
840838

841839
k = kh_get_pymap(self.table, <PyObject*>val)
842840
if k != self.table.n_buckets:

pandas/core/algorithms.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -520,7 +520,7 @@ def sort_mixed(values):
520520

521521

522522
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None,
523-
dropna=False):
523+
dropna=True):
524524
"""
525525
Encode input values as an enumerated type or categorical variable
526526

pandas/core/categorical.py

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -234,7 +234,8 @@ class Categorical(PandasObject):
234234
__array_priority__ = 1000
235235
_typ = 'categorical'
236236

237-
def __init__(self, values, categories=None, ordered=False, fastpath=False):
237+
def __init__(self, values, categories=None, ordered=False, fastpath=False,
238+
dropna=True):
238239

239240
self._validate_ordered(ordered)
240241

@@ -281,9 +282,10 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):
281282

282283
if categories is None:
283284
try:
284-
codes, categories = factorize(values, sort=True)
285+
codes, categories = factorize(values, sort=True, dropna=dropna)
285286
except TypeError:
286-
codes, categories = factorize(values, sort=False)
287+
codes, categories = factorize(values, sort=False,
288+
dropna=dropna)
287289
if ordered:
288290
# raise, as we don't have a sortable data structure and so
289291
# the user should give us one by specifying categories
@@ -2106,7 +2108,7 @@ def _convert_to_list_like(list_like):
21062108
return [list_like]
21072109

21082110

2109-
def _factorize_from_iterable(values):
2111+
def _factorize_from_iterable(values, dropna=True):
21102112
"""
21112113
Factorize an input `values` into `categories` and `codes`. Preserves
21122114
categorical dtype in `categories`.
@@ -2137,13 +2139,13 @@ def _factorize_from_iterable(values):
21372139
ordered=values.ordered)
21382140
codes = values.codes
21392141
else:
2140-
cat = Categorical(values, ordered=True)
2142+
cat = Categorical(values, ordered=True, dropna=dropna)
21412143
categories = cat.categories
21422144
codes = cat.codes
21432145
return codes, categories
21442146

21452147

2146-
def _factorize_from_iterables(iterables):
2148+
def _factorize_from_iterables(iterables, dropna=True):
21472149
"""
21482150
A higher-level wrapper over `_factorize_from_iterable`.
21492151
@@ -2165,4 +2167,5 @@ def _factorize_from_iterables(iterables):
21652167
if len(iterables) == 0:
21662168
# For consistency, it should return a list of 2 lists.
21672169
return [[], []]
2168-
return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))
2170+
return map(list, lzip(*[_factorize_from_iterable(it, dropna)
2171+
for it in iterables]))

pandas/core/indexes/multi.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1043,7 +1043,7 @@ def lexsort_depth(self):
10431043
return 0
10441044

10451045
@classmethod
1046-
def from_arrays(cls, arrays, sortorder=None, names=None):
1046+
def from_arrays(cls, arrays, sortorder=None, names=None, dropna=False):
10471047
"""
10481048
Convert arrays to MultiIndex
10491049
@@ -1083,7 +1083,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
10831083

10841084
from pandas.core.categorical import _factorize_from_iterables
10851085

1086-
labels, levels = _factorize_from_iterables(arrays)
1086+
labels, levels = _factorize_from_iterables(arrays, dropna=dropna)
10871087
if names is None:
10881088
names = [getattr(arr, "name", None) for arr in arrays]
10891089

pandas/tests/reshape/test_pivot.py

Lines changed: 18 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1180,17 +1180,18 @@ def test_margin_dropna(self):
11801180
df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan],
11811181
'b': [3, 3, 4, 4, 4, 4]})
11821182
actual = pd.crosstab(df.a, df.b, margins=True, dropna=False)
1183-
expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]])
1184-
expected.index = Index([1.0, 2.0, 'All'], name='a')
1183+
expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [0, 1, 1], [2, 4, 6]])
1184+
expected.index = Index([1.0, 2.0, np.nan, 'All'], name='a')
11851185
expected.columns = Index([3, 4, 'All'], name='b')
11861186
tm.assert_frame_equal(actual, expected)
11871187

11881188
df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan],
11891189
'b': [3, np.nan, 4, 4, 4, 4]})
11901190
actual = pd.crosstab(df.a, df.b, margins=True, dropna=False)
1191-
expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]])
1192-
expected.index = Index([1.0, 2.0, 'All'], name='a')
1193-
expected.columns = Index([3.0, 4.0, 'All'], name='b')
1191+
expected = pd.DataFrame([[1, 0, 0, 1], [0, 1, 0, 1], [0, 3, 1, 4],
1192+
[1, 4, 0, 6]])
1193+
expected.index = Index([1.0, 2.0, np.nan, 'All'], name='a')
1194+
expected.columns = Index([3.0, 4.0, np.nan, 'All'], name='b')
11941195
tm.assert_frame_equal(actual, expected)
11951196

11961197
a = np.array(['foo', 'foo', 'foo', 'bar',
@@ -1202,21 +1203,25 @@ def test_margin_dropna(self):
12021203

12031204
actual = pd.crosstab(a, [b, c], rownames=['a'],
12041205
colnames=['b', 'c'], margins=True, dropna=False)
1205-
m = MultiIndex.from_arrays([['one', 'one', 'two', 'two', 'All'],
1206-
['dull', 'shiny', 'dull', 'shiny', '']],
1206+
m = MultiIndex.from_arrays([[np.nan, np.nan, 'one', 'one', 'two',
1207+
'two', 'All'],
1208+
['dull', 'shiny', 'dull', 'shiny', 'dull',
1209+
'shiny', '']],
12071210
names=['b', 'c'])
1208-
expected = DataFrame([[1, 0, 1, 0, 2], [2, 0, 1, 1, 5],
1209-
[3, 0, 2, 1, 7]], columns=m)
1211+
expected = DataFrame([[0, 0, 1, 0, 1, 0, 2], [0, 1, 2, 0, 1, 1, 5],
1212+
[0, 1, 3, 0, 2, 1, 7]], columns=m)
12101213
expected.index = Index(['bar', 'foo', 'All'], name='a')
12111214
tm.assert_frame_equal(actual, expected)
12121215

12131216
actual = pd.crosstab([a, b], c, rownames=['a', 'b'],
12141217
colnames=['c'], margins=True, dropna=False)
1215-
m = MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo', 'All'],
1216-
['one', 'two', 'one', 'two', '']],
1218+
m = MultiIndex.from_arrays([['bar', 'bar', 'bar', 'foo', 'foo',
1219+
'foo', 'All'],
1220+
[np.nan, 'one', 'two', np.nan, 'one',
1221+
'two', '']],
12171222
names=['a', 'b'])
1218-
expected = DataFrame([[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2],
1219-
[5, 2, 7]], index=m)
1223+
expected = DataFrame([[0, 0, 0], [1, 0, 1], [1, 0, 1], [0, 1, 1],
1224+
[2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m)
12201225
expected.columns = Index(['dull', 'shiny', 'All'], name='c')
12211226
tm.assert_frame_equal(actual, expected)
12221227

0 commit comments

Comments
 (0)