
Factorize fix #11


Merged
6 commits merged on Mar 14, 2018
2 changes: 1 addition & 1 deletion .appveyor.yml
@@ -70,7 +70,7 @@ install:

# create our env
- cmd: conda install -q -y conda-build anaconda-client
- cmd: conda create -q -n test-environment python=%PYTHON_VERSION% coverage cython flake8 hypothesis numpy pytest pytest-cov python-dateutil pytz six
- cmd: conda create -q -n test-environment python=%PYTHON_VERSION% coverage cython flake8 hypothesis numba numpy pytest pytest-cov python-dateutil pytz six
- cmd: activate test-environment
- cmd: conda list -n test-environment

1 change: 1 addition & 0 deletions ci/environment.yml
@@ -7,6 +7,7 @@ dependencies:
- flake8
- ipython
- matplotlib
- numba
- numpy
- numpydoc
- pandas
1 change: 1 addition & 0 deletions ci/install-travis.sh
@@ -32,6 +32,7 @@ conda install -q \
cython \
flake8 \
hypothesis \
numba \
numpy \
pytest \
pytest-cov \
5 changes: 3 additions & 2 deletions conda-recipes/cyberpandas/meta.yaml
@@ -16,10 +16,11 @@ requirements:
- setuptools >=3.3

run:
- ipaddress # [py27]
- numba
- pandas
- python
- setuptools >=3.3
- pandas
- ipaddress # [py27]

test:
imports:
32 changes: 32 additions & 0 deletions cyberpandas/_utils.py
@@ -1,6 +1,7 @@
"""Utilities for working with IP address data."""
import struct

import numba
import six


@@ -31,3 +32,34 @@ def combine(hi, lo):
# type: (int, int) -> int
"""Combine the hi and lo bytes into the final ip address."""
return (hi << 64) + lo


@numba.jit(nopython=True)
def refactorize(arr, first_na, na_sentinel=-1):
"""
Modify `arr` *inplace* to match pandas' factorization rules.

This detects the code missing values were assigned, sets
those to `na_sentinel`, and shifts codes above that value
down by 1 to fill the hole.

Parameters
----------
arr : ndarray
First return value from :meth:`pandas.factorize`
first_na : int
The index location of the first missing value
na_sentinel : int, default -1
Value to set for missing values.
"""
# A naive benchmark shows that this gets ~285x speedup
# with numba on a 10,000 element array.
na_code = arr[first_na]
for i in range(len(arr)):
val = arr[i]
if val == na_code:
arr[i] = na_sentinel
elif val > na_code:
arr[i] -= 1

return arr
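
As a worked illustration of the cleanup `refactorize` performs, here is a minimal plain-NumPy sketch (not part of the patch; the codes and `first_na` position below are made up):

```python
import numpy as np

# Codes as pd.factorize might return them when the missing value was treated
# as an ordinary value: here it was assigned code 1, and the first missing
# element sits at index 2.
labels = np.array([0, 0, 1, 2, 0, 3], dtype=np.intp)
first_na = 2
na_sentinel = -1

na_code = labels[first_na]    # the code that was (wrongly) given to missing values
is_na = labels == na_code     # positions holding the missing value
above = labels > na_code      # codes that must shift down to fill the hole

labels[is_na] = na_sentinel
labels[above] -= 1

print(labels)                 # [ 0  0 -1  1  0  2]
```

This vectorized form computes the same result as the loop above; the numba-jitted loop does it in a single pass without the temporary masks.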
79 changes: 67 additions & 12 deletions cyberpandas/ip_array.py
@@ -12,7 +12,7 @@

from ._accessor import (DelegatedMethod, DelegatedProperty,
delegated_method)
from ._utils import combine, pack, unpack
from ._utils import combine, pack, unpack, refactorize
from .common import _U8_MAX, _IPv4_MAX
from .parser import _to_ipaddress_pyint, _as_ip_object

@@ -69,6 +69,10 @@ def __init__(self, values):
values = _to_ip_array(values) # TODO: avoid potential copy
self.data = values

@classmethod
def _constructor_from_sequence(cls, scalars):
return cls(scalars)

# -------------------------------------------------------------------------
# Pandas Interface
# -------------------------------------------------------------------------
@@ -287,7 +291,7 @@ def equals(self, other):

def isna(self):
ips = self.data
return (ips['lo'] == 0) & (ips['lo'] - ips['hi'] == 0)
return (ips['lo'] == 0) & (ips['hi'] == 0)

def argsort(self, axis=-1, kind='quicksort', order=None):
return self.data.argsort()
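
For context on the `isna` simplification above: a missing address is stored as the all-zero record, i.e. both 64-bit halves are zero, which renders as IPv4Address('0.0.0.0'). A minimal sketch of the same check on a hand-built record array (the exact dtype is an assumption; only the 'hi'/'lo' field names come from the diff):

```python
import numpy as np

# Two unsigned 64-bit halves per address; dtype details here are illustrative.
record_type = np.dtype([('hi', np.uint64), ('lo', np.uint64)])

ips = np.array([(0, 2), (0, 0), (1, 1)], dtype=record_type)
mask = (ips['lo'] == 0) & (ips['hi'] == 0)
print(mask)   # [False  True False] -- only the all-zero record counts as missing
```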
@@ -460,16 +464,67 @@ def unique(self):
data = self.data.take(np.sort(indices))
return self._from_ndarray(data)

def factorize(self, sort=False):
# XXX: Verify this, check for better algo
uniques, indices, labels = np.unique(self.data,
return_index=True,
return_inverse=True)
if not sort:
# Unsort, since np.unique sorts
uniques = self._from_ndarray(self.data.take(np.sort(indices)))
labels = np.argsort(uniques.data).take(labels)
return labels, uniques
def factorize(self, na_sentinel=-1):
"""Factorize an IPArray into integer labels and unique values.

Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize`
will dispatch to this method.

Parameters
----------
na_sentinel : int, default -1
The value in `labels` to use for indicating missing values in
`self`.

Returns
-------
labels : ndarray
An integer-type ndarray the same length as `self`. Each newly-
observed value in `self` will be assigned the next integer.
Missing values in self are assigned `na_sentinel`.
uniques : IPArray
The unique values in `self` in order of appearance, not including
the missing value ``IPv4Address('0.0.0.0')``.

See Also
--------
pandas.factorize, pandas.Series.factorize

Examples
--------
>>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1])
>>> arr
IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1',
'0.0.0.2', '::1:0:0:0:1'])

>>> labels, uniques = arr.factorize()
>>> labels
array([ 0, 0, -1, 1, 0, 2])

Notice that `uniques` does not include the missing value.
>>> uniques
IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1'])
"""
# OK, so here's the plan.
# Start with factorizing `self.data`, which has two unfortunate issues
# 1. Requires casting to object.
# 2. Gets the NA logic wrong, since (0, 0) isn't NA to pandas.
# For now, we can't help with 1. Maybe someday.
# For 2, we can "fix" things with a little post-factorization cleanup.
l, u = pd.factorize(self.data)
mask = self.isna()
any_na = mask.any()

if any_na:
first_na = mask.argmax()
refactorize(l, first_na, na_sentinel=na_sentinel) # inplace op

# u is an ndarray of tuples. Go to our record type, then an IPArray
u2 = type(self)((u.astype(self.dtype._record_type)))
# May have a missing value.
if any_na:
u2 = u2[~u2.isna()]
return l, u2


# -----
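
To make the plan spelled out in the comments concrete, here is a small sketch of step 1 and the problem it creates (the (hi, lo) tuples and the plain object array are illustrative; the patch calls pd.factorize on its record array directly):

```python
import numpy as np
import pandas as pd

# (hi, lo) pairs; (0, 0) is the record that IPArray treats as missing.
records = [(0, 2), (0, 2), (0, 0), (0, 1)]
data = np.empty(len(records), dtype=object)
data[:] = records

# Step 1: plain factorization. To pandas, (0, 0) is just another value,
# so it receives an ordinary code.
codes, uniques = pd.factorize(data)
print(codes)    # [0 0 1 2] -- the missing record got code 1
print(uniques)  # [(0, 2) (0, 0) (0, 1)]

# Step 2 is the fix-up: refactorize rewrites the codes to [ 0  0 -1  1 ]
# and the (0, 0) record is then dropped from the uniques, exactly as the
# method above does when any_na is true.
```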
10 changes: 10 additions & 0 deletions cyberpandas/test_interface.py
@@ -38,6 +38,16 @@ def data_missing_for_sorting():
return ip.IPArray([2 ** 64 + 1, 0, 1])


@pytest.fixture
def data_for_grouping():
b = 1
a = 2 ** 32 + 1
c = 2 ** 32 + 10
return ip.IPArray([
b, b, 0, 0, a, a, b, c
])


@pytest.fixture
def na_cmp():
"""Binary operator for comparing NA values.
11 changes: 3 additions & 8 deletions cyberpandas/test_ip.py
@@ -290,15 +290,10 @@ def test_unique():
tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize('sort', [
pytest.param(True, marks=pytest.mark.xfail(reason="Upstream sort_values")),
False
])
def test_factorize(sort):
def test_factorize():
arr = ip.IPArray([3, 3, 1, 2, 3, _U8_MAX + 1])
labels, uniques = arr.factorize(sort=sort)
expected_labels, expected_uniques = pd.factorize(arr.astype(object),
sort=sort)
labels, uniques = arr.factorize()
expected_labels, expected_uniques = pd.factorize(arr.astype(object))

assert isinstance(uniques, ip.IPArray)

1 change: 1 addition & 0 deletions setup.py
@@ -23,5 +23,6 @@
packages=find_packages(),
install_requires=[
'pandas>=0.23.0.dev0',
'numba',
]
)