2
2
3
3
import numpy as np
4
4
5
- import pandas as pd
6
5
from pandas .core .arrays import ExtensionArray
7
6
8
- from ._utils import refactorize
9
-
10
7
11
8
class NumPyBackedExtensionArrayMixin (ExtensionArray ):
12
9
@property
@@ -18,6 +15,10 @@ def dtype(self):
18
15
def _constructor_from_sequence (cls , scalars ):
19
16
return cls (scalars )
20
17
18
+ @classmethod
19
+ def _from_factorized (cls , values , original ):
20
+ return cls (values )
21
+
21
22
@property
22
23
def shape (self ):
23
24
return (len (self .data ),)
@@ -68,65 +69,3 @@ def unique(self):
68
69
_ , indices = np .unique (self .data , return_index = True )
69
70
data = self .data .take (np .sort (indices ))
70
71
return self ._from_ndarray (data )
71
-
72
- def factorize (self , na_sentinel = - 1 ):
73
- """Factorize an IPArray into integer labels and unique values.
74
-
75
- Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize`
76
- will dispatch to this method.
77
-
78
- Parameters
79
- ----------
80
- na_sentinel : int, default -1
81
- The value in `labels` to use for indicating missing values in
82
- `self`.
83
-
84
- Returns
85
- -------
86
- labels : ndarray
87
- An integer-type ndarray the same length as `self`. Each newly-
88
- observed value in `self` will be assigned the next integer.
89
- Missing values in self are assigned `na_sentinel`.
90
- uniques : IPArray
91
- The unique values in `self` in order of appearance, not including
92
- the missing value ``IPv4Address('0.0.0.0')``.
93
-
94
- See Also
95
- --------
96
- pandas.factorize, pandas.Series.factorize
97
-
98
- Examples
99
- --------
100
- >>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1])
101
- >>> arr
102
- IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1',
103
- '0.0.0.2', '::1:0:0:0:1'])
104
-
105
- >>> labels, uniques = arr.factorize()
106
- >>> labels
107
- array([ 0, 0, -1, 1, 0, 2])
108
-
109
- Notice that `uniques` does not include the missing value.
110
- >>> uniques
111
- IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1'])
112
- """
113
- # OK, so here's the plan.
114
- # Start with factorizing `self.data`, which has two unfortunate issues
115
- # 1. Requires casting to object.
116
- # 2. Gets the NA logic wrong, since (0, 0) isn't NA to pandas.
117
- # For now, we can't help with 1. Maybe someday.
118
- # For 2, we can "fix" things with a little post-factorization cleanup.
119
- l , u = pd .factorize (self .data )
120
- mask = self .isna ()
121
- any_na = mask .any ()
122
-
123
- if any_na :
124
- first_na = mask .argmax ()
125
- refactorize (l , first_na , na_sentinel = na_sentinel ) # inplace op
126
-
127
- # u is an ndarray of tuples. Go to our record type, then an IPArray
128
- u2 = type (self )((u .astype (self .dtype ._record_type )))
129
- # May have a missing value.
130
- if any_na :
131
- u2 = u2 [~ u2 .isna ()]
132
- return l , u2
0 commit comments