Fixed factorize for MACArray

TomAugspurger · TomAugspurger · commit fa420adf1080 · 2018-03-20T15:58:46.000-05:00
Relies on pandas-dev/pandas#19957
diff --git a/cyberpandas/base.py b/cyberpandas/base.py
@@ -2,11 +2,8 @@
 
 import numpy as np
 
-import pandas as pd
 from pandas.core.arrays import ExtensionArray
 
-from ._utils import refactorize
-
 
 class NumPyBackedExtensionArrayMixin(ExtensionArray):
     @property
@@ -18,6 +15,10 @@ def dtype(self):
     def _constructor_from_sequence(cls, scalars):
         return cls(scalars)
 
+    @classmethod
+    def _from_factorized(cls, values, original):  # presumably pandas' hook
+        return cls(values)  # `original` is accepted but not used here
+
     @property
     def shape(self):
         return (len(self.data),)
@@ -68,65 +69,3 @@ def unique(self):
         _, indices = np.unique(self.data, return_index=True)
         data = self.data.take(np.sort(indices))
         return self._from_ndarray(data)
-
-    def factorize(self, na_sentinel=-1):
-        """Factorize an IPArray into integer labels and unique values.
-
-        Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize`
-        will dispatch to this method.
-
-        Parameters
-        ----------
-        na_sentinel : int, default -1
-            The value in `labels` to use for indicating missing values in
-            `self`.
-
-        Returns
-        -------
-        labels : ndarray
-            An integer-type ndarray the same length as `self`. Each newly-
-            observed value in `self` will be assigned the next integer.
-            Missing values in self are assigned `na_sentinel`.
-        uniques : IPArray
-            The unique values in `self` in order of appereance, not including
-            the missing value ``IPv4Address('0.0.0.0')``.
-
-        See Also
-        --------
-        pandas.factorize, pandas.Series.factorize
-
-        Examples
-        --------
-        >>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1])
-        >>> arr
-        IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1',
-                 '0.0.0.2', '::1:0:0:0:1'])
-
-        >>> labels, uniques = arr.factorize()
-        >>> labels
-        array([ 0,  0, -1,  1,  0,  2])
-
-        Notice that `uniques` does not include the missing value.
-        >>> uniques
-        IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1'])
-        """
-        # OK, so here's the plan.
-        # Start with factorizing `self.data`, which has two unfortunate issues
-        # 1. Requires casting to object.
-        # 2. Gets the NA logic wrong, since (0, 0) isn't NA to pandas.
-        # For now, we can't help with 1. Maybe someday.
-        # For 2, we can "fix" things with a little post-factorization cleanup.
-        l, u = pd.factorize(self.data)
-        mask = self.isna()
-        any_na = mask.any()
-
-        if any_na:
-            first_na = mask.argmax()
-            refactorize(l, first_na, na_sentinel=na_sentinel)  # inplace op
-
-        # u is an ndarray of tuples. Go to our record type, then an IPArray
-        u2 = type(self)((u.astype(self.dtype._record_type)))
-        # May have a missing value.
-        if any_na:
-            u2 = u2[~u2.isna()]
-        return l, u2
diff --git a/cyberpandas/mac_array.py b/cyberpandas/mac_array.py
@@ -1,3 +1,5 @@
+from collections import Iterable
+
 import numpy as np
 import six
 
@@ -60,9 +62,7 @@ def _box_scalar(scalar):
         return scalar
 
     def __setitem__(self, key, value):
-        from .parser import to_ipaddress
-
-        value = to_ipaddress(value).data
+        value = to_macaddress(value)  # coerce to a uint64 ndarray first
         self.data[key] = value
 
     def __iter__(self):
@@ -126,3 +126,13 @@ def _parse(mac):
     # https://stackoverflow.com/a/36883363/1889400
     mac_int = int(mac.replace(":", "").replace("-", ""), 16)
     return mac_int
+
+
+def to_macaddress(addresses):  # str, scalar, or iterable -> uint64 ndarray
+    if (isinstance(addresses, six.string_types) or
+            not isinstance(addresses, Iterable)):  # py3.10+: collections.abc
+        addresses = [addresses]  # wrap a lone string/scalar in a list
+
+    addresses = [_parse(mac) if isinstance(mac, six.string_types) else mac
+                 for mac in addresses]  # strings go through _parse
+    return np.array(addresses, dtype='u8')  # 'u8' = unsigned 64-bit int
diff --git a/cyberpandas/parser.py b/cyberpandas/parser.py
@@ -57,7 +57,7 @@ def _to_int_pairs(values):
     if isinstance(values, (str, bytes, int)):
         values = ipaddress.ip_address(values)._ip
         return unpack(pack(values))
-    elif isinstance(values, np.ndarray):
+    elif isinstance(values, np.ndarray) and values.dtype != object:
         if values.ndim != 2:
             raise ValueError("'values' should be a 2-D when passing a "
                              "NumPy array.")
diff --git a/tests/mac/test_interface.py b/tests/mac/test_interface.py
@@ -31,12 +31,12 @@ def all_data(request, data, data_missing):
 
 @pytest.fixture
 def data_for_sorting():
-    return MACArray([10, 2 ** 64 + 1, 1])
+    return MACArray([10, 2 ** 64 - 1, 1])  # 2**64 - 1 is the uint64 max
 
 
 @pytest.fixture
 def data_missing_for_sorting():
-    return MACArray([2 ** 64 + 1, 0, 1])
+    return MACArray([2 ** 64 - 1, 0, 1])  # 2**64 - 1 is the uint64 max
 
 
 @pytest.fixture