diff --git a/cyberpandas/ip_array.py b/cyberpandas/ip_array.py
index 587b8b7..59ab389 100644
--- a/cyberpandas/ip_array.py
+++ b/cyberpandas/ip_array.py
@@ -452,28 +452,40 @@ def index_type(self):
 
     def unique(self):
         # type: () -> ExtensionArray
-        pass
+        # https://github.com/pandas-dev/pandas/pull/19869
+        # np.unique returns sorted values; taking the sorted first-occurrence
+        # indices restores the order of first appearance.
+        _, indices = np.unique(self.data, return_index=True)
+        data = self.data.take(np.sort(indices))
+        return self._from_ndarray(data)
 
-    def _factorize(self, sort=False):
+    def factorize(self, sort=False):
+        """Encode the addresses as integer labels and unique values.
+
+        Parameters
+        ----------
+        sort : bool, default False
+            If True, ``uniques`` is in sorted order; otherwise it is in
+            order of first appearance.
+
+        Returns
+        -------
+        labels : ndarray of int
+            Position in ``uniques`` of each element of this array.
+        uniques : IPArray
+            The distinct values.
+        """
         # XXX: Verify this, check for better algo
-        # astype to avoid endianness issues in pd.factorize
-        a, _ = pd.factorize(self.data['lo'].astype('u8'))
-        b, _ = pd.factorize(self.data['hi'].astype('u8'))
-
-        labels = np.bitwise_xor.reduce(
-            np.concatenate([a.reshape(-1, 1),
-                            b.reshape(-1, 1)], axis=1),
-            axis=1
-        )
-
-        # TODO: refactor into a .unique
-        # TODO: Handle empty, scalar, etc.
-        mask = np.zeros(len(labels), dtype=bool)
-        mask[0] = True
-        inner_mask = (labels[1:] - labels[:-1]) != 0
-        mask[1:] = inner_mask
-
-        uniques = self[mask]
+        uniques, indices, labels = np.unique(self.data,
+                                             return_index=True,
+                                             return_inverse=True)
+        if not sort:
+            # np.unique sorts; reorder the uniques by first appearance and
+            # remap the labels onto that order.
+            uniques = self.data.take(np.sort(indices))
+            labels = np.argsort(uniques).take(labels)
+        # Wrap in both branches so ``uniques`` is never a raw ndarray.
+        uniques = self._from_ndarray(uniques)
         return labels, uniques
 
 
diff --git a/cyberpandas/test_ip.py b/cyberpandas/test_ip.py
index 435e042..dc6061b 100644
--- a/cyberpandas/test_ip.py
+++ b/cyberpandas/test_ip.py
@@ -278,3 +278,32 @@ def test_bytes_roundtrip():
 
     result = ip.IPArray.from_bytes(bytestring)
     assert result.equals(arr)
+
+
+def test_unique():
+    """unique() returns an IPArray matching pd.unique on object values."""
+    arr = ip.IPArray([3, 3, 1, 2, 3, _U8_MAX + 1])
+    result = arr.unique()
+    assert isinstance(result, ip.IPArray)
+
+    result = result.astype(object)
+    expected = pd.unique(arr.astype(object))
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize('sort', [
+    pytest.param(True, marks=pytest.mark.xfail(reason="Upstream sort_values")),
+    False
+])
+def test_factorize(sort):
+    """factorize() agrees with pd.factorize on the object values."""
+    arr = ip.IPArray([3, 3, 1, 2, 3, _U8_MAX + 1])
+    labels, uniques = arr.factorize(sort=sort)
+    expected_labels, expected_uniques = pd.factorize(arr.astype(object),
+                                                     sort=sort)
+
+    assert isinstance(uniques, ip.IPArray)
+
+    uniques = uniques.astype(object)
+    tm.assert_numpy_array_equal(labels, expected_labels)
+    tm.assert_numpy_array_equal(uniques, expected_uniques)