From 053c66923b84af19524097f092b0ea61073f105c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Mar 2018 13:50:50 -0500 Subject: [PATCH 1/6] REF: Base class for array --- cyberpandas/__init__.py | 1 - cyberpandas/base.py | 132 +++++++++++++ cyberpandas/ip_array.py | 423 +++++++++++++--------------------------- 3 files changed, 267 insertions(+), 289 deletions(-) create mode 100644 cyberpandas/base.py diff --git a/cyberpandas/__init__.py b/cyberpandas/__init__.py index c9f2712..c617d35 100644 --- a/cyberpandas/__init__.py +++ b/cyberpandas/__init__.py @@ -4,7 +4,6 @@ IPType, IPArray, IPAccessor, - IPAddressIndex, ) from .parser import to_ipaddress # noqa diff --git a/cyberpandas/base.py b/cyberpandas/base.py new file mode 100644 index 0000000..429cb96 --- /dev/null +++ b/cyberpandas/base.py @@ -0,0 +1,132 @@ +import operator + +import numpy as np + +import pandas as pd +from pandas.core.arrays import ExtensionArray + +from ._utils import refactorize + + +class NumPyBackedExtensionArrayMixin(ExtensionArray): + @property + def dtype(self): + """The dtype for this extension array, IPType""" + return self._dtype + + @classmethod + def _constructor_from_sequence(cls, scalars): + return cls(scalars) + + @property + def shape(self): + return (len(self.data),) + + def __len__(self): + return len(self.data) + + def __getitem__(self, *args): + result = operator.getitem(self.data, *args) + if isinstance(result, tuple): + return self._box_scalar(result) + elif result.ndim == 0: + return self._box_scalar(result.item()) + else: + return type(self)(result) + + def setitem(self, indexer, value): + """Set the 'value' inplace. + """ + # I think having a separate than __setitem__ is good + # since we have to return here, but __setitem__ doesn't. + self[indexer] = value + return self + + @property + def nbytes(self): + return self._itemsize * len(self) + + def _formatting_values(self): + return np.array(self._format_values(), dtype='object') + + def copy(self, deep=False): + return type(self)(self.data.copy()) + + @classmethod + def _concat_same_type(cls, to_concat): + return cls(np.concatenate([array.data for array in to_concat])) + + def tolist(self): + return self.data.tolist() + + def argsort(self, axis=-1, kind='quicksort', order=None): + return self.data.argsort() + + def unique(self): + # type: () -> ExtensionArray + # https://github.com/pandas-dev/pandas/pull/19869 + _, indices = np.unique(self.data, return_index=True) + data = self.data.take(np.sort(indices)) + return self._from_ndarray(data) + + def factorize(self, na_sentinel=-1): + """Factorize an IPArray into integer labels and unique values. + + Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize` + will dispatch to this method. + + Parameters + ---------- + na_sentinel : int, default -1 + The value in `labels` to use for indicating missing values in + `self`. + + Returns + ------- + labels : ndarray + An integer-type ndarray the same length as `self`. Each newly- + observed value in `self` will be assigned the next integer. + Missing values in self are assigned `na_sentinel`. + uniques : IPArray + The unique values in `self` in order of appereance, not including + the missing value ``IPv4Address('0.0.0.0')``. + + See Also + -------- + pandas.factorize, pandas.Series.factorize + + Examples + -------- + >>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1]) + >>> arr + IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1', + '0.0.0.2', '::1:0:0:0:1']) + + >>> labels, uniques = arr.factorize() + >>> labels + array([ 0, 0, -1, 1, 0, 2]) + + Notice that `uniques` does not include the missing value. + >>> uniques + IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1']) + """ + # OK, so here's the plan. + # Start with factorizing `self.data`, which has two unfortunate issues + # 1. Requires casting to object. + # 2. Gets the NA logic wrong, since (0, 0) isn't NA to pandas. + # For now, we can't help with 1. Maybe someday. + # For 2, we can "fix" things with a little post-factorization cleanup. + l, u = pd.factorize(self.data) + mask = self.isna() + any_na = mask.any() + + if any_na: + first_na = mask.argmax() + refactorize(l, first_na, na_sentinel=na_sentinel) # inplace op + + # u is an ndarray of tuples. Go to our record type, then an IPArray + u2 = type(self)((u.astype(self.dtype._record_type))) + # May have a missing value. + if any_na: + u2 = u2[~u2.isna()] + return l, u2 diff --git a/cyberpandas/ip_array.py b/cyberpandas/ip_array.py index f37e54c..baf592e 100644 --- a/cyberpandas/ip_array.py +++ b/cyberpandas/ip_array.py @@ -7,12 +7,12 @@ import numpy as np import pandas as pd # TODO: public API -from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.dtypes import ExtensionDtype from ._accessor import (DelegatedMethod, DelegatedProperty, delegated_method) from ._utils import combine, pack, unpack, refactorize +from .base import NumPyBackedExtensionArrayMixin from .common import _U8_MAX, _IPv4_MAX from .parser import _to_ipaddress_pyint, _as_ip_object @@ -51,7 +51,7 @@ def construct_from_string(cls, string): # ----------------------------------------------------------------------------- -class IPArray(ExtensionArray): +class IPArray(NumPyBackedExtensionArrayMixin): """Holder for IP Addresses.""" # A note on the internal data layout. IPv6 addresses require 128 bits, # which is more than a uint64 can store. So we use a NumPy structured array @@ -60,6 +60,7 @@ class IPArray(ExtensionArray): # all IP traffic is big-endian. __array_priority__ = 1000 _dtype = IPType() + _itemsize = 16 ndim = 1 can_hold_na = True @@ -69,123 +70,6 @@ def __init__(self, values): values = _to_ip_array(values) # TODO: avoid potential copy self.data = values - @classmethod - def _constructor_from_sequence(cls, scalars): - return cls(scalars) - - # ------------------------------------------------------------------------- - # Pandas Interface - # ------------------------------------------------------------------------- - @property - def dtype(self): - """The dtype for this extension array, IPType""" - return self._dtype - - @property - def shape(self): - """A length-tuple with the length of the array.""" - return (len(self.data),) - - @property - def nbytes(self): - """The number of bytes taken to store this array. - - It takes 16 bytes to store each addresses. - """ - return 16 * len(self) - - def take(self, indexer, allow_fill=True, fill_value=None): - mask = indexer == -1 - result = self.data.take(indexer) - result[mask] = unpack(pack(int(self.na_value))) - return type(self)(result) - - def _formatting_values(self): - return np.array(self._format_values(), dtype='object') - - @classmethod - def _concat_same_type(cls, to_concat): - return cls(np.concatenate([array.data for array in to_concat])) - - def take_nd(self, indexer, allow_fill=True, fill_value=None): - return self.take(indexer, allow_fill=allow_fill, fill_value=fill_value) - - def copy(self, deep=False): - return type(self)(self.data.copy()) - - # ------------------------------------------------------------------------- - # Iterator / Sequence interface - # ------------------------------------------------------------------------- - def __len__(self): - return len(self.data) - - def __getitem__(self, *args): - result = operator.getitem(self.data, *args) - if isinstance(result, tuple): - return ipaddress.ip_address(combine(*result)) - elif isinstance(result, np.void): - result = result.item() - return ipaddress.ip_address(combine(*result)) - else: - return type(self)(result) - - def __setitem__(self, key, value): - from .parser import to_ipaddress - - value = to_ipaddress(value).data - self.data[key] = value - - def __iter__(self): - return iter(self.to_pyipaddress()) - - @property - def na_value(self): - return self.dtype.na_value - - def to_pyipaddress(self): - import ipaddress - return [ipaddress.ip_address(x) for x in self._format_values()] - - def to_pyints(self): - return [combine(*map(int, x)) for x in self.data] - - def to_bytes(self): - """Serialize the IPArray as a Python bytestring. - - Examples - -------- - >>> arr = IPArray([10, 20]) - >>> arr.to_bytes() - b'\x00\x00\...x00\x02' - """ - return self.data.tobytes() - - def __repr__(self): - formatted = self._format_values() - return "IPArray({!r})".format(formatted) - - def _format_values(self): - formatted = [] - # TODO: perf - for i in range(len(self)): - hi, lo = self.data[i] - if lo == -1: - formatted.append("NA") - elif hi == 0 and lo <= _IPv4_MAX: - formatted.append(ipaddress.IPv4Address._string_from_ip_int( - int(lo))) - elif hi == 0: - formatted.append(ipaddress.IPv6Address._string_from_ip_int( - int(lo))) - else: - # TODO: - formatted.append(ipaddress.IPv6Address._string_from_ip_int( - (int(hi) << 64) + int(lo))) - return formatted - - def tolist(self): - return self.data.tolist() - @classmethod def from_pyints(cls, values): # type: T.Sequence[int]) -> 'IPArray' @@ -244,6 +128,90 @@ def _from_ndarray(cls, data, copy=False): new.data = data return new + # ------------------------------------------------------------------------- + # Properties + # ------------------------------------------------------------------------- + @property + def na_value(self): + return self.dtype.na_value + + def take(self, indexer, allow_fill=True, fill_value=None): + mask = indexer == -1 + result = self.data.take(indexer) + result[mask] = unpack(pack(int(self.na_value))) + return type(self)(result) # TODO: check for copy + + # ------------------------------------------------------------------------- + # Interfaces + # ------------------------------------------------------------------------- + + def __repr__(self): + formatted = self._format_values() + return "IPArray({!r})".format(formatted) + + def _format_values(self): + formatted = [] + # TODO: perf + for i in range(len(self)): + hi, lo = self.data[i] + if lo == -1: + formatted.append("NA") + elif hi == 0 and lo <= _IPv4_MAX: + formatted.append(ipaddress.IPv4Address._string_from_ip_int( + int(lo))) + elif hi == 0: + formatted.append(ipaddress.IPv6Address._string_from_ip_int( + int(lo))) + else: + # TODO: + formatted.append(ipaddress.IPv6Address._string_from_ip_int( + (int(hi) << 64) + int(lo))) + return formatted + + @staticmethod + def _box_scalar(scalar): + return ipaddress.ip_address(combine(*scalar)) + + @property + def _parser(self): + from .parser import to_ipaddress + return to_ipaddress + + def __setitem__(self, key, value): + from .parser import to_ipaddress + + value = to_ipaddress(value).data + self.data[key] = value + + def __iter__(self): + return iter(self.to_pyipaddress()) + + # ------------------------------------------------------------------------ + # Serializaiton / Export + # ------------------------------------------------------------------------ + + def to_pyipaddress(self): + import ipaddress + return [ipaddress.ip_address(x) for x in self._format_values()] + + def to_pyints(self): + return [combine(*map(int, x)) for x in self.data] + + def to_bytes(self): + """Serialize the IPArray as a Python bytestring. + + Examples + -------- + >>> arr = IPArray([10, 20]) + >>> arr.to_bytes() + b'\x00\x00\...x00\x02' + """ + return self.data.tobytes() + + # ------------------------------------------------------------------------ + # Ops + # ------------------------------------------------------------------------ + def __eq__(self, other): # TDOO: scalar ipaddress if not isinstance(other, IPArray): @@ -293,69 +261,6 @@ def isna(self): ips = self.data return (ips['lo'] == 0) & (ips['hi'] == 0) - def argsort(self, axis=-1, kind='quicksort', order=None): - return self.data.argsort() - - @property - def is_ipv4(self): - # TODO: NA should be NA - ips = self.data - return (ips['hi'] == 0) & (ips['lo'] < _U8_MAX) - - @property - def is_ipv6(self): - ips = self.data - return (ips['hi'] > 0) | (ips['lo'] > _U8_MAX) - - @property - def version(self): - return np.where(self.is_ipv4, 4, 6) - - @property - def is_multicast(self): - pyips = self.to_pyipaddress() - return np.array([ip.is_multicast for ip in pyips]) - - @property - def is_private(self): - pyips = self.to_pyipaddress() - return np.array([ip.is_private for ip in pyips]) - - @property - def is_global(self): - pyips = self.to_pyipaddress() - return np.array([ip.is_global for ip in pyips]) - - @property - def is_unspecified(self): - pyips = self.to_pyipaddress() - return np.array([ip.is_unspecified for ip in pyips]) - - @property - def is_reserved(self): - pyips = self.to_pyipaddress() - return np.array([ip.is_reserved for ip in pyips]) - - @property - def is_loopback(self): - pyips = self.to_pyipaddress() - return np.array([ip.is_loopback for ip in pyips]) - - @property - def is_link_local(self): - pyips = self.to_pyipaddress() - return np.array([ip.is_link_local for ip in pyips]) - - @property - def packed(self): - """Bytestring of the IP addresses - - Each address takes 16 bytes. IPv4 addresses are prefixed - by zeros. - """ - # TODO: I wonder if that should be post-fixed by 0s. - return self.data.tobytes() - def isin(self, other): """Check whether elements of 'self' are in 'other'. @@ -445,127 +350,69 @@ def _isin_addresses(self, other): # TODO(factorize): replace this return isin(self, other) - def setitem(self, indexer, value): - """Set the 'value' inplace. - """ - # I think having a separate than __setitem__ is good - # since we have to return here, but __setitem__ doesn't. - self[indexer] = value - return self + # ------------------------------------------------------------------------ + # IP Specific + # ------------------------------------------------------------------------ @property - def index_type(self): - return IPAddressIndex - - def unique(self): - # type: () -> ExtensionArray - # https://github.com/pandas-dev/pandas/pull/19869 - _, indices = np.unique(self.data, return_index=True) - data = self.data.take(np.sort(indices)) - return self._from_ndarray(data) - - def factorize(self, na_sentinel=-1): - """Factorize an IPArray into integer labels and unique values. - - Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize` - will dispatch to this method. + def is_ipv4(self): + # TODO: NA should be NA + ips = self.data + return (ips['hi'] == 0) & (ips['lo'] < _U8_MAX) - Parameters - ---------- - na_sentinel : int, default -1 - The value in `labels` to use for indicating missing values in - `self`. + @property + def is_ipv6(self): + ips = self.data + return (ips['hi'] > 0) | (ips['lo'] > _U8_MAX) - Returns - ------- - labels : ndarray - An integer-type ndarray the same length as `self`. Each newly- - observed value in `self` will be assigned the next integer. - Missing values in self are assigned `na_sentinel`. - uniques : IPArray - The unique values in `self` in order of appereance, not including - the missing value ``IPv4Address('0.0.0.0')``. + @property + def version(self): + return np.where(self.is_ipv4, 4, 6) - See Also - -------- - pandas.factorize, pandas.Series.factorize + @property + def is_multicast(self): + pyips = self.to_pyipaddress() + return np.array([ip.is_multicast for ip in pyips]) - Examples - -------- - >>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1]) - >>> arr - IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1', - '0.0.0.2', '::1:0:0:0:1']) - - >>> labels, uniques = arr.factorize() - >>> labels - array([ 0, 0, -1, 1, 0, 2]) - - Notice that `uniques` does not include the missing value. - >>> uniques - IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1']) - """ - # OK, so here's the plan. - # Start with factorizing `self.data`, which has two unfortunate issues - # 1. Requires casting to object. - # 2. Gets the NA logic wrong, since (0, 0) isn't NA to pandas. - # For now, we can't help with 1. Maybe someday. - # For 2, we can "fix" things with a little post-factorization cleanup. - l, u = pd.factorize(self.data) - mask = self.isna() - any_na = mask.any() - - if any_na: - first_na = mask.argmax() - refactorize(l, first_na, na_sentinel=na_sentinel) # inplace op - - # u is an ndarray of tuples. Go to our record type, then an IPArray - u2 = type(self)((u.astype(self.dtype._record_type))) - # May have a missing value. - if any_na: - u2 = u2[~u2.isna()] - return l, u2 - - -# ----- -# Index -# ----- - -class IPAddressIndex(pd.Index): - _typ = 'ipaddressindex' - _attributes = ['name'] - _holder = IPArray - - def __new__(cls, data=None, name=None): - from .parser import _to_ip_array + @property + def is_private(self): + pyips = self.to_pyipaddress() + return np.array([ip.is_private for ip in pyips]) - if data is None: - data = [] + @property + def is_global(self): + pyips = self.to_pyipaddress() + return np.array([ip.is_global for ip in pyips]) - data = _to_ip_array(data) - return cls._simple_new(data, name=name) + @property + def is_unspecified(self): + pyips = self.to_pyipaddress() + return np.array([ip.is_unspecified for ip in pyips]) - @classmethod - def _simple_new(cls, data, name=None): - result = object.__new__(cls) - values = cls._holder(data) - result._data = values - result._name = name - result._reset_identity() - return result + @property + def is_reserved(self): + pyips = self.to_pyipaddress() + return np.array([ip.is_reserved for ip in pyips]) - def __repr__(self): - tpl = 'IPAddressIndex({})' - return tpl.format(self._data._format_values()) + @property + def is_loopback(self): + pyips = self.to_pyipaddress() + return np.array([ip.is_loopback for ip in pyips]) @property - def inferred_type(self): - return self._typ + def is_link_local(self): + pyips = self.to_pyipaddress() + return np.array([ip.is_link_local for ip in pyips]) @property - def values(self): - return self._data + def packed(self): + """Bytestring of the IP addresses + Each address takes 16 bytes. IPv4 addresses are prefixed + by zeros. + """ + # TODO: I wonder if that should be post-fixed by 0s. + return self.data.tobytes() # ----------------------------------------------------------------------------- # Accessor From a7a4c6f9d8629043b250434089d0cd94cf91490f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Mar 2018 13:53:30 -0500 Subject: [PATCH 2/6] TST: Reorganize module --- tests/ip/__init__.py | 0 {cyberpandas => tests/ip}/test_dtypes.py | 0 {cyberpandas => tests/ip}/test_interface.py | 0 {cyberpandas => tests/ip}/test_ip.py | 0 {cyberpandas => tests/ip}/test_ip_pandas.py | 26 ------------------- .../ip}/test_pandas_methods.py | 0 {cyberpandas => tests/ip}/test_parser.py | 0 7 files changed, 26 deletions(-) create mode 100644 tests/ip/__init__.py rename {cyberpandas => tests/ip}/test_dtypes.py (100%) rename {cyberpandas => tests/ip}/test_interface.py (100%) rename {cyberpandas => tests/ip}/test_ip.py (100%) rename {cyberpandas => tests/ip}/test_ip_pandas.py (82%) rename {cyberpandas => tests/ip}/test_pandas_methods.py (100%) rename {cyberpandas => tests/ip}/test_parser.py (100%) diff --git a/tests/ip/__init__.py b/tests/ip/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cyberpandas/test_dtypes.py b/tests/ip/test_dtypes.py similarity index 100% rename from cyberpandas/test_dtypes.py rename to tests/ip/test_dtypes.py diff --git a/cyberpandas/test_interface.py b/tests/ip/test_interface.py similarity index 100% rename from cyberpandas/test_interface.py rename to tests/ip/test_interface.py diff --git a/cyberpandas/test_ip.py b/tests/ip/test_ip.py similarity index 100% rename from cyberpandas/test_ip.py rename to tests/ip/test_ip.py diff --git a/cyberpandas/test_ip_pandas.py b/tests/ip/test_ip_pandas.py similarity index 82% rename from cyberpandas/test_ip_pandas.py rename to tests/ip/test_ip_pandas.py index 9065f65..757fba5 100644 --- a/cyberpandas/test_ip_pandas.py +++ b/tests/ip/test_ip_pandas.py @@ -66,24 +66,6 @@ def test_dataframe_from_series(): assert isinstance(result.dtypes['A'], ip.IPType) -def test_index_constructor(): - result = ip.IPAddressIndex([0, 1, 2]) - assert isinstance(result, ip.IPAddressIndex) - assert result._data.equals(ip.IPArray([0, 1, 2])) - if six.PY2: - assert repr(result) == ("IPAddressIndex([u'0.0.0.0', u'0.0.0.1', " - "u'0.0.0.2'])") - else: - assert repr(result) == ("IPAddressIndex(['0.0.0.0', '0.0.0.1', " - "'0.0.0.2'])") - - -@pytest.mark.xfail(reason="ExtensionIndex not implemented") -def test_series_with_index(): - ser = pd.Series([1, 2, 3], index=ip.IPAddressIndex([0, 1, 2])) - repr(ser) - - def test_getitem_scalar(): ser = pd.Series(ip.IPArray([0, 1, 2])) result = ser[1] @@ -109,14 +91,6 @@ def test_setitem_scalar(): # -------------- -@pytest.mark.xfail(reason="upstream") -def test_value_counts(): - result = pd.Series(ip.IPArray([1, 1, 2, 3, 3, 3])).value_counts() - expected = pd.Series([3, 2, 1], - index=ip.IPAddressIndex([3, 1, 2])) - tm.assert_series_equal(result, expected) - - @given(lists(integers(min_value=1, max_value=2**128 - 1))) def test_argsort(ints): pass diff --git a/cyberpandas/test_pandas_methods.py b/tests/ip/test_pandas_methods.py similarity index 100% rename from cyberpandas/test_pandas_methods.py rename to tests/ip/test_pandas_methods.py diff --git a/cyberpandas/test_parser.py b/tests/ip/test_parser.py similarity index 100% rename from cyberpandas/test_parser.py rename to tests/ip/test_parser.py From 21fbcfffd43add66f6aa61e109006176cc0fde76 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Mar 2018 13:53:50 -0500 Subject: [PATCH 3/6] ENH: Added MAC address --- cyberpandas/mac_array.py | 126 ++++++++++++++++++++++++++++++++++++ tests/__init__.py | 0 tests/mac/__init__.py | 0 tests/mac/test_interface.py | 97 +++++++++++++++++++++++++++ 4 files changed, 223 insertions(+) create mode 100644 cyberpandas/mac_array.py create mode 100644 tests/__init__.py create mode 100644 tests/mac/__init__.py create mode 100644 tests/mac/test_interface.py diff --git a/cyberpandas/mac_array.py b/cyberpandas/mac_array.py new file mode 100644 index 0000000..3f4da37 --- /dev/null +++ b/cyberpandas/mac_array.py @@ -0,0 +1,126 @@ +import numpy as np + +from pandas.core.dtypes.dtypes import ExtensionDtype + +from .base import NumPyBackedExtensionArrayMixin + + +class MACType(ExtensionDtype): + """Dtype for MAC Address Data.""" + name = 'mac' + type = int + kind = 'u' + na_value = 0 # TODO: Check this. + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string)) + + +class MACArray(NumPyBackedExtensionArrayMixin): + """Array for MAC Address data. + + * https://en.wikipedia.org/wiki/MAC_address + * https://tools.ietf.org/html/rfc5342 + """ + # What type(s) do we support? + # MAC-48 or EUI-64? + _dtype = MACType() + _itemsize = 8 + ndim = 1 + can_hold_na = True + + def __init__(self, values, copy=True): + # TODO: parse hex / strings + self.data = np.array(values, dtype='uint64', copy=copy) + + @classmethod + def _from_ndarray(cls, data, copy=False): + return cls(data, copy=copy) + + @property + def na_value(self): + return self.dtype.na_value + + def __repr__(self): + formatted = self._format_values() + return "MACArray({!r})".format(formatted) + + def _format_values(self): + return [_format(x) for x in self.data] + + @staticmethod + def _box_scalar(scalar): + return scalar + + def __setitem__(self, key, value): + from .parser import to_ipaddress + + value = to_ipaddress(value).data + self.data[key] = value + + def __iter__(self): + return iter(self.data.tolist()) + + def __lt__(self, other): + return self.data < other + + def __le__(self, other): + return self.data <= other + + def __eq__(self, other): + return self.data == other + + def __ge__(self, other): + return other <= self + + def __gt__(self, other): + return other < self + + def equals(self, other): + if not isinstance(other, type(self)): + raise TypeError + return (self.data == other.data).all() + + def isna(self): + return (self.data == 0) + + @property + def _parser(self): + return lambda x: x + + def take(self, indexer, allow_fill=True, fill_value=None): + mask = indexer == -1 + result = self.data.take(indexer) + result[mask] = self.dtype.na_value + return type(self)(result, copy=False) + + def _formatting_values(self): + return np.array(self._format_values(), dtype='object') + + @classmethod + def _concat_same_type(cls, to_concat): + return cls(np.concatenate([array.data for array in to_concat])) + + def take_nd(self, indexer, allow_fill=True, fill_value=None): + return self.take(indexer, allow_fill=allow_fill, fill_value=fill_value) + + def copy(self, deep=False): + return type(self)(self.data.copy()) + + +def _format(mac): + # https://stackoverflow.com/a/36883363/1889400 + mac_hex = "{:012x}".format(mac) + mac_str = ":".join(mac_hex[i:i+2] for i in range(0, len(mac_hex), 2)) + return mac_str + + +def _parse(mac): + # https://stackoverflow.com/a/36883363/1889400 + mac_int = int(mac.replace(":", "").replace("-", ""), 16) + return mac_int diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/mac/__init__.py b/tests/mac/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/mac/test_interface.py b/tests/mac/test_interface.py new file mode 100644 index 0000000..d3e2892 --- /dev/null +++ b/tests/mac/test_interface.py @@ -0,0 +1,97 @@ +import pytest + +from pandas.tests.extension import base + +from cyberpandas.mac_array import MACArray, MACType + + + +@pytest.fixture +def dtype(): + return MACType() + + +@pytest.fixture +def data(): + return MACArray(list(range(100))) + + +@pytest.fixture +def data_missing(): + return MACArray([0, 1]) + + +@pytest.fixture(params=['data', 'data_missing']) +def all_data(request, data, data_missing): + """Parametrized fixture giving 'data' and 'data_missing'""" + if request.param == 'data': + return data + elif request.param == 'data_missing': + return data_missing + + +@pytest.fixture +def data_for_sorting(): + return MACArray([10, 2 ** 64 + 1, 1]) + + +@pytest.fixture +def data_missing_for_sorting(): + return MACArray([2 ** 64 + 1, 0, 1]) + + +@pytest.fixture +def data_for_grouping(): + b = 1 + a = 2 ** 32 + 1 + c = 2 ** 32 + 10 + return MACArray([ + b, b, 0, 0, a, a, b, c + ]) + + +@pytest.fixture +def na_cmp(): + """Binary operator for comparing NA values. + + Should return a function of two arguments that returns + True if both arguments are (scalar) NA for your type. + + By defult, uses ``operator.or`` + """ + return lambda x, y: int(x) == int(y) == 0 + + +@pytest.fixture +def na_value(): + return MACType.na_value + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.xfail(reason='upstream') + def test_value_counts(data, dropna): + pass From d23fe9006ac69618662f2d250d463c6af3ca148f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Mar 2018 13:55:07 -0500 Subject: [PATCH 4/6] Linting --- cyberpandas/ip_array.py | 4 ++-- tests/ip/test_ip_pandas.py | 1 - tests/mac/test_interface.py | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cyberpandas/ip_array.py b/cyberpandas/ip_array.py index baf592e..682c5dc 100644 --- a/cyberpandas/ip_array.py +++ b/cyberpandas/ip_array.py @@ -1,7 +1,6 @@ import abc import collections import ipaddress -import operator import six import numpy as np @@ -11,7 +10,7 @@ from ._accessor import (DelegatedMethod, DelegatedProperty, delegated_method) -from ._utils import combine, pack, unpack, refactorize +from ._utils import combine, pack, unpack from .base import NumPyBackedExtensionArrayMixin from .common import _U8_MAX, _IPv4_MAX from .parser import _to_ipaddress_pyint, _as_ip_object @@ -414,6 +413,7 @@ def packed(self): # TODO: I wonder if that should be post-fixed by 0s. return self.data.tobytes() + # ----------------------------------------------------------------------------- # Accessor # ----------------------------------------------------------------------------- diff --git a/tests/ip/test_ip_pandas.py b/tests/ip/test_ip_pandas.py index 757fba5..16bbb87 100644 --- a/tests/ip/test_ip_pandas.py +++ b/tests/ip/test_ip_pandas.py @@ -2,7 +2,6 @@ """ import ipaddress -import six import pytest import numpy as np from hypothesis.strategies import integers, lists diff --git a/tests/mac/test_interface.py b/tests/mac/test_interface.py index d3e2892..8211a32 100644 --- a/tests/mac/test_interface.py +++ b/tests/mac/test_interface.py @@ -5,7 +5,6 @@ from cyberpandas.mac_array import MACArray, MACType - @pytest.fixture def dtype(): return MACType() From 9be182636f3ea8ba17d945826bc923a47bc28011 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Mar 2018 14:16:37 -0500 Subject: [PATCH 5/6] Update test commands --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 61c3cbf..3cb3a5b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,8 +21,8 @@ install: script: - echo "script start" - source activate test-environment - - pytest cyberpandas - - flake8 cyberpandas + - pytest + - flake8 - source ./ci/build.sh after_success: From 9ca0f3395ddeeec1ff818f06a4f29f5bac0f1542 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Mar 2018 11:28:46 -0500 Subject: [PATCH 6/6] COMPAT: long for py2 --- cyberpandas/mac_array.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cyberpandas/mac_array.py b/cyberpandas/mac_array.py index 3f4da37..d753839 100644 --- a/cyberpandas/mac_array.py +++ b/cyberpandas/mac_array.py @@ -1,4 +1,5 @@ import numpy as np +import six from pandas.core.dtypes.dtypes import ExtensionDtype @@ -8,7 +9,8 @@ class MACType(ExtensionDtype): """Dtype for MAC Address Data.""" name = 'mac' - type = int + # type is long for Py2 and int for py3 + type = six.integer_types[-1] kind = 'u' na_value = 0 # TODO: Check this.