diff --git a/.appveyor.yml b/.appveyor.yml index a965be5..f285d0b 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -70,7 +70,7 @@ install: # create our env - cmd: conda install -q -y conda-build anaconda-client - - cmd: conda create -q -n test-environment python=%PYTHON_VERSION% coverage cython flake8 hypothesis numpy pytest pytest-cov python-dateutil pytz six + - cmd: conda create -q -n test-environment python=%PYTHON_VERSION% coverage cython flake8 hypothesis numba numpy pytest pytest-cov python-dateutil pytz six - cmd: activate test-environment - cmd: conda list -n test-environment diff --git a/ci/environment.yml b/ci/environment.yml index c227e7e..ad7c4a2 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -7,6 +7,7 @@ dependencies: - flake8 - ipython - matplotlib + - numba - numpy - numpydoc - pandas diff --git a/ci/install-travis.sh b/ci/install-travis.sh index 48a2345..de7bcd6 100755 --- a/ci/install-travis.sh +++ b/ci/install-travis.sh @@ -32,6 +32,7 @@ conda install -q \ cython \ flake8 \ hypothesis \ + numba \ numpy \ pytest \ pytest-cov \ diff --git a/conda-recipes/cyberpandas/meta.yaml b/conda-recipes/cyberpandas/meta.yaml index fbc3116..c6ae907 100644 --- a/conda-recipes/cyberpandas/meta.yaml +++ b/conda-recipes/cyberpandas/meta.yaml @@ -16,10 +16,11 @@ requirements: - setuptools >=3.3 run: + - ipaddress # [py27] + - numba + - pandas - python - setuptools >=3.3 - - pandas - - ipaddress # [py27] test: imports: diff --git a/cyberpandas/_utils.py b/cyberpandas/_utils.py index ab6dd35..744f2d3 100644 --- a/cyberpandas/_utils.py +++ b/cyberpandas/_utils.py @@ -1,6 +1,7 @@ """Utilities for working with IP address data.""" import struct +import numba import six @@ -31,3 +32,34 @@ def combine(hi, lo): # type: (int, int) -> int """Combine the hi and lo bytes into the final ip address.""" return (hi << 64) + lo + + +@numba.jit(nopython=True) +def refactorize(arr, first_na, na_sentinel=-1): + """ + Modify `arr` *inplace* to match pandas' factorization 
rules. + + This detects the code that missing values were assigned, sets + those to `na_sentinel`, and shifts codes above that value + down by 1 to fill the hole. + + Parameters + ---------- + arr : ndarray + First return value from :meth:`pandas.factorize` + first_na : int + The index location of the first missing value + na_sentinel : int, default -1 + Value to set for missing values. + """ + # A naive benchmark shows that this gets ~285x speedup + # with numba on a 10,000 element array. + na_code = arr[first_na] + for i in range(len(arr)): + val = arr[i] + if val == na_code: + arr[i] = na_sentinel + elif val > na_code: + arr[i] -= 1 + + return arr diff --git a/cyberpandas/ip_array.py b/cyberpandas/ip_array.py index f3a64ea..f37e54c 100644 --- a/cyberpandas/ip_array.py +++ b/cyberpandas/ip_array.py @@ -12,7 +12,7 @@ from ._accessor import (DelegatedMethod, DelegatedProperty, delegated_method) -from ._utils import combine, pack, unpack +from ._utils import combine, pack, unpack, refactorize from .common import _U8_MAX, _IPv4_MAX from .parser import _to_ipaddress_pyint, _as_ip_object @@ -69,6 +69,10 @@ def __init__(self, values): values = _to_ip_array(values) # TODO: avoid potential copy self.data = values + @classmethod + def _constructor_from_sequence(cls, scalars): + return cls(scalars) + # ------------------------------------------------------------------------- # Pandas Interface # ------------------------------------------------------------------------- @@ -287,7 +291,7 @@ def equals(self, other): def isna(self): ips = self.data - return (ips['lo'] == 0) & (ips['lo'] - ips['hi'] == 0) + return (ips['lo'] == 0) & (ips['hi'] == 0) def argsort(self, axis=-1, kind='quicksort', order=None): return self.data.argsort() @@ -460,16 +464,67 @@ def unique(self): data = self.data.take(np.sort(indices)) return self._from_ndarray(data) - def factorize(self, sort=False): - # XXX: Verify this, check for better algo - uniques, indices, labels = np.unique(self.data, -
return_index=True, - return_inverse=True) - if not sort: - # Unsort, since np.unique sorts - uniques = self._from_ndarray(self.data.take(np.sort(indices))) - labels = np.argsort(uniques.data).take(labels) - return labels, uniques + def factorize(self, na_sentinel=-1): + """Factorize an IPArray into integer labels and unique values. + + Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize` + will dispatch to this method. + + Parameters + ---------- + na_sentinel : int, default -1 + The value in `labels` to use for indicating missing values in + `self`. + + Returns + ------- + labels : ndarray + An integer-type ndarray the same length as `self`. Each newly- + observed value in `self` will be assigned the next integer. + Missing values in self are assigned `na_sentinel`. + uniques : IPArray + The unique values in `self` in order of appearance, not including + the missing value ``IPv4Address('0.0.0.0')``. + + See Also + -------- + pandas.factorize, pandas.Series.factorize + + Examples + -------- + >>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1]) + >>> arr + IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1', + '0.0.0.2', '::1:0:0:0:1']) + + >>> labels, uniques = arr.factorize() + >>> labels + array([ 0, 0, -1, 1, 0, 2]) + + Notice that `uniques` does not include the missing value. + >>> uniques + IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1']) + """ + # OK, so here's the plan. + # Start with factorizing `self.data`, which has two unfortunate issues + # 1. Requires casting to object. + # 2. Gets the NA logic wrong, since (0, 0) isn't NA to pandas. + # For now, we can't help with 1. Maybe someday. + # For 2, we can "fix" things with a little post-factorization cleanup. + l, u = pd.factorize(self.data) + mask = self.isna() + any_na = mask.any() + + if any_na: + first_na = mask.argmax() + refactorize(l, first_na, na_sentinel=na_sentinel) # inplace op + + # u is an ndarray of tuples.
Go to our record type, then an IPArray + u2 = type(self)((u.astype(self.dtype._record_type))) + # May have a missing value. + if any_na: + u2 = u2[~u2.isna()] + return l, u2 # ----- diff --git a/cyberpandas/test_interface.py b/cyberpandas/test_interface.py index 7bffe8d..1edfc8f 100644 --- a/cyberpandas/test_interface.py +++ b/cyberpandas/test_interface.py @@ -38,6 +38,16 @@ def data_missing_for_sorting(): return ip.IPArray([2 ** 64 + 1, 0, 1]) +@pytest.fixture +def data_for_grouping(): + b = 1 + a = 2 ** 32 + 1 + c = 2 ** 32 + 10 + return ip.IPArray([ + b, b, 0, 0, a, a, b, c + ]) + + @pytest.fixture def na_cmp(): """Binary operator for comparing NA values. diff --git a/cyberpandas/test_ip.py b/cyberpandas/test_ip.py index dc6061b..6bb9d5d 100644 --- a/cyberpandas/test_ip.py +++ b/cyberpandas/test_ip.py @@ -290,15 +290,10 @@ def test_unique(): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize('sort', [ - pytest.param(True, marks=pytest.mark.xfail(reason="Upstream sort_values")), - False -]) -def test_factorize(sort): +def test_factorize(): arr = ip.IPArray([3, 3, 1, 2, 3, _U8_MAX + 1]) - labels, uniques = arr.factorize(sort=sort) - expected_labels, expected_uniques = pd.factorize(arr.astype(object), - sort=sort) + labels, uniques = arr.factorize() + expected_labels, expected_uniques = pd.factorize(arr.astype(object)) assert isinstance(uniques, ip.IPArray) diff --git a/setup.py b/setup.py index f3916e1..c448242 100644 --- a/setup.py +++ b/setup.py @@ -23,5 +23,6 @@ packages=find_packages(), install_requires=[ 'pandas>=0.23.0.dev0', + 'numba', ] )