
Factorize fix #11


Merged
6 commits merged on Mar 14, 2018
2 changes: 1 addition & 1 deletion .appveyor.yml
@@ -70,7 +70,7 @@ install:

# create our env
- cmd: conda install -q -y conda-build anaconda-client
- cmd: conda create -q -n test-environment python=%PYTHON_VERSION% coverage cython flake8 hypothesis numpy pytest pytest-cov python-dateutil pytz six
- cmd: conda create -q -n test-environment python=%PYTHON_VERSION% coverage cython flake8 hypothesis numba numpy pytest pytest-cov python-dateutil pytz six
- cmd: activate test-environment
- cmd: conda list -n test-environment

1 change: 1 addition & 0 deletions ci/environment.yml
@@ -7,6 +7,7 @@ dependencies:
- flake8
- ipython
- matplotlib
- numba
- numpy
- numpydoc
- pandas
1 change: 1 addition & 0 deletions ci/install-travis.sh
@@ -32,6 +32,7 @@ conda install -q \
cython \
flake8 \
hypothesis \
numba \
numpy \
pytest \
pytest-cov \
5 changes: 3 additions & 2 deletions conda-recipes/cyberpandas/meta.yaml
@@ -16,10 +16,11 @@ requirements:
- setuptools >=3.3

run:
- ipaddress # [py27]
- numba
- pandas
- python
- setuptools >=3.3
- pandas
- ipaddress # [py27]

test:
imports:
32 changes: 32 additions & 0 deletions cyberpandas/_utils.py
@@ -1,6 +1,7 @@
"""Utilities for working with IP address data."""
import struct

import numba
import six


@@ -31,3 +32,34 @@ def combine(hi, lo):
# type: (int, int) -> int
"""Combine the hi and lo bytes into the final ip address."""
return (hi << 64) + lo


@numba.jit(nopython=True)
def refactorize(arr, first_na, na_sentinel=-1):
"""
Modify `arr` *inplace* to match pandas' factorization rules.

This detects the code missing values were assigned, sets
those to `na_sentinel`, and shifts codes above that value
down by 1 to fill the hole.

Parameters
----------
arr : ndarray
First return value from :meth:`pandas.factorize`
first_na : int
The index location of the first missing value
na_sentinel : int, default -1
Value to set for missing values.
"""
# A naive benchmark shows that this gets ~285x speedup
# with numba on a 10,000 element array.
na_code = arr[first_na]
for i in range(len(arr)):
val = arr[i]
if val == na_code:
arr[i] = na_sentinel
elif val > na_code:
arr[i] -= 1

return arr
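
As a worked illustration of the cleanup `refactorize` performs, here is a minimal plain-NumPy sketch (not part of the patch; the codes and `first_na` position below are made up):

```python
import numpy as np

# Codes as pd.factorize might return them when the missing value was treated
# as an ordinary value: here it was assigned code 1, and the first missing
# element sits at index 2.
labels = np.array([0, 0, 1, 2, 0, 3], dtype=np.intp)
first_na = 2
na_sentinel = -1

na_code = labels[first_na]    # the code that was (wrongly) given to missing values
is_na = labels == na_code     # positions holding the missing value
above = labels > na_code      # codes that must shift down to fill the hole

labels[is_na] = na_sentinel
labels[above] -= 1

print(labels)                 # [ 0  0 -1  1  0  2]
```

This vectorized form computes the same result as the loop above; the numba-jitted loop does it in a single pass without the temporary masks.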
79 changes: 67 additions & 12 deletions cyberpandas/ip_array.py
@@ -12,7 +12,7 @@

from ._accessor import (DelegatedMethod, DelegatedProperty,
delegated_method)
from ._utils import combine, pack, unpack
from ._utils import combine, pack, unpack, refactorize
from .common import _U8_MAX, _IPv4_MAX
from .parser import _to_ipaddress_pyint, _as_ip_object

@@ -69,6 +69,10 @@ def __init__(self, values):
values = _to_ip_array(values) # TODO: avoid potential copy
self.data = values

@classmethod
def _constructor_from_sequence(cls, scalars):
return cls(scalars)

# -------------------------------------------------------------------------
# Pandas Interface
# -------------------------------------------------------------------------
@@ -287,7 +291,7 @@ def equals(self, other):

def isna(self):
ips = self.data
return (ips['lo'] == 0) & (ips['lo'] - ips['hi'] == 0)
return (ips['lo'] == 0) & (ips['hi'] == 0)

def argsort(self, axis=-1, kind='quicksort', order=None):
return self.data.argsort()
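
For context on the `isna` simplification above: a missing address is stored as the all-zero record, i.e. both 64-bit halves are zero, which renders as IPv4Address('0.0.0.0'). A minimal sketch of the same check on a hand-built record array (the exact dtype is an assumption; only the 'hi'/'lo' field names come from the diff):

```python
import numpy as np

# Two unsigned 64-bit halves per address; dtype details here are illustrative.
record_type = np.dtype([('hi', np.uint64), ('lo', np.uint64)])

ips = np.array([(0, 2), (0, 0), (1, 1)], dtype=record_type)
mask = (ips['lo'] == 0) & (ips['hi'] == 0)
print(mask)   # [False  True False] -- only the all-zero record counts as missing
```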
@@ -460,16 +464,67 @@ def unique(self):
data = self.data.take(np.sort(indices))
return self._from_ndarray(data)

def factorize(self, sort=False):
# XXX: Verify this, check for better algo
uniques, indices, labels = np.unique(self.data,
return_index=True,
return_inverse=True)
if not sort:
# Unsort, since np.unique sorts
uniques = self._from_ndarray(self.data.take(np.sort(indices)))
labels = np.argsort(uniques.data).take(labels)
return labels, uniques
def factorize(self, na_sentinel=-1):
"""Factorize an IPArray into integer labels and unique values.

Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize`
will dispatch to this method.

Parameters
----------
na_sentinel : int, default -1
The value in `labels` to use for indicating missing values in
`self`.

Returns
-------
labels : ndarray
An integer-type ndarray the same length as `self`. Each newly-
observed value in `self` will be assigned the next integer.
Missing values in self are assigned `na_sentinel`.
uniques : IPArray
The unique values in `self` in order of appearance, not including
the missing value ``IPv4Address('0.0.0.0')``.

See Also
--------
pandas.factorize, pandas.Series.factorize

Examples
--------
>>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1])
>>> arr
IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1',
'0.0.0.2', '::1:0:0:0:1'])

>>> labels, uniques = arr.factorize()
>>> labels
array([ 0, 0, -1, 1, 0, 2])

Notice that `uniques` does not include the missing value.
>>> uniques
IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1'])
"""
# OK, so here's the plan.
# Start with factorizing `self.data`, which has two unfortunate issues
# 1. Requires casting to object.
# 2. Gets the NA logic wrong, since (0, 0) isn't NA to pandas.
# For now, we can't help with 1. Maybe someday.
# For 2, we can "fix" things with a little post-factorization cleanup.
l, u = pd.factorize(self.data)
mask = self.isna()
any_na = mask.any()

if any_na:
first_na = mask.argmax()
refactorize(l, first_na, na_sentinel=na_sentinel) # inplace op

# u is an ndarray of tuples. Go to our record type, then an IPArray
u2 = type(self)((u.astype(self.dtype._record_type)))
# May have a missing value.
if any_na:
u2 = u2[~u2.isna()]
return l, u2


# -----
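
To make the plan spelled out in the comments concrete, here is a small sketch of step 1 and the problem it creates (the (hi, lo) tuples and the plain object array are illustrative; the patch calls pd.factorize on its record array directly):

```python
import numpy as np
import pandas as pd

# (hi, lo) pairs; (0, 0) is the record that IPArray treats as missing.
records = [(0, 2), (0, 2), (0, 0), (0, 1)]
data = np.empty(len(records), dtype=object)
data[:] = records

# Step 1: plain factorization. To pandas, (0, 0) is just another value,
# so it receives an ordinary code.
codes, uniques = pd.factorize(data)
print(codes)    # [0 0 1 2] -- the missing record got code 1
print(uniques)  # [(0, 2) (0, 0) (0, 1)]

# Step 2 is the fix-up: refactorize rewrites the codes to [ 0  0 -1  1 ]
# and the (0, 0) record is then dropped from the uniques, exactly as the
# method above does when any_na is true.
```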
10 changes: 10 additions & 0 deletions cyberpandas/test_interface.py
@@ -38,6 +38,16 @@ def data_missing_for_sorting():
return ip.IPArray([2 ** 64 + 1, 0, 1])


@pytest.fixture
def data_for_grouping():
b = 1
a = 2 ** 32 + 1
c = 2 ** 32 + 10
return ip.IPArray([
b, b, 0, 0, a, a, b, c
])


@pytest.fixture
def na_cmp():
"""Binary operator for comparing NA values.
11 changes: 3 additions & 8 deletions cyberpandas/test_ip.py
@@ -290,15 +290,10 @@ def test_unique():
tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize('sort', [
pytest.param(True, marks=pytest.mark.xfail(reason="Upstream sort_values")),
False
])
def test_factorize(sort):
def test_factorize():
arr = ip.IPArray([3, 3, 1, 2, 3, _U8_MAX + 1])
labels, uniques = arr.factorize(sort=sort)
expected_labels, expected_uniques = pd.factorize(arr.astype(object),
sort=sort)
labels, uniques = arr.factorize()
expected_labels, expected_uniques = pd.factorize(arr.astype(object))

assert isinstance(uniques, ip.IPArray)

1 change: 1 addition & 0 deletions setup.py
@@ -23,5 +23,6 @@
packages=find_packages(),
install_requires=[
'pandas>=0.23.0.dev0',
'numba',
]
)