From fad69fbe3f7b1b8788a170faf544efca1ffe4ec6 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 3 Nov 2021 14:56:03 -0500 Subject: [PATCH 1/9] feat: support conversion from pyarrow RecordBatch to pandas DataFrame --- db_dtypes/core.py | 3 +- tests/unit/test_arrow.py | 294 +++++++++++++++++++++------------------ 2 files changed, 157 insertions(+), 140 deletions(-) diff --git a/db_dtypes/core.py b/db_dtypes/core.py index fbc784e..7f6da0f 100644 --- a/db_dtypes/core.py +++ b/db_dtypes/core.py @@ -17,6 +17,7 @@ import numpy import pandas from pandas._libs import NaT +import pandas.api.extensions import pandas.compat.numpy.function import pandas.core.algorithms import pandas.core.arrays @@ -32,7 +33,7 @@ pandas_release = pandas_backports.pandas_release -class BaseDatetimeDtype(pandas.core.dtypes.base.ExtensionDtype): +class BaseDatetimeDtype(pandas.api.extensions.ExtensionDtype): na_value = NaT kind = "o" names = None diff --git a/tests/unit/test_arrow.py b/tests/unit/test_arrow.py index d3745ea..82cf60d 100644 --- a/tests/unit/test_arrow.py +++ b/tests/unit/test_arrow.py @@ -13,160 +13,176 @@ # limitations under the License. import datetime as dt +from typing import Optional import pandas +import pandas.api.extensions +import pandas.testing import pyarrow import pytest -# To register the types. -import db_dtypes # noqa +import db_dtypes -@pytest.mark.parametrize( - ("series", "expected"), +def types_mapper( + pyarrow_type: pyarrow.DataType, +) -> Optional[pandas.api.extensions.ExtensionDtype]: + type_str = str(pyarrow_type) + + if type_str.startswith("date32") or type_str.startswith("date64"): + return db_dtypes.DateDtype + elif type_str.startswith("time32") or type_str.startswith("time64"): + return db_dtypes.TimeDtype + else: + # Use default type mapping. 
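+        # Returning None tells pyarrow to fall back to its built-in type
+        # mapping for this column (for example, int64 -> numpy int64).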
+ return None + + +SERIES_ARRAYS_DEFAULT_TYPES = [ + (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date32())), ( - (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date32())), - ( - pandas.Series([None, None, None], dtype="dbdate"), - pyarrow.array([None, None, None], type=pyarrow.date32()), - ), - ( - pandas.Series( - [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate" - ), - pyarrow.array( - [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], - type=pyarrow.date32(), - ), - ), - ( - pandas.Series( - [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], - dtype="dbdate", - ), - pyarrow.array( - [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], - type=pyarrow.date32(), - ), - ), - ( - pandas.Series([], dtype="dbtime"), - pyarrow.array([], type=pyarrow.time64("ns")), - ), - ( - pandas.Series([None, None, None], dtype="dbtime"), - pyarrow.array([None, None, None], type=pyarrow.time64("ns")), - ), - ( - pandas.Series( - [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], - dtype="dbtime", - ), - pyarrow.array( - [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], - type=pyarrow.time64("ns"), - ), - ), - ( - pandas.Series( - [ - dt.time(0, 0, 0, 0), - dt.time(12, 30, 15, 125_000), - dt.time(23, 59, 59, 999_999), - ], - dtype="dbtime", - ), - pyarrow.array( - [ - dt.time(0, 0, 0, 0), - dt.time(12, 30, 15, 125_000), - dt.time(23, 59, 59, 999_999), - ], - type=pyarrow.time64("ns"), - ), + pandas.Series([None, None, None], dtype="dbdate"), + pyarrow.array([None, None, None], type=pyarrow.date32()), + ), + ( + pandas.Series( + [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate" + ), + pyarrow.array( + [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], type=pyarrow.date32(), ), ), -) + ( + pandas.Series( + [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], + dtype="dbdate", + ), + pyarrow.array( + [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], + type=pyarrow.date32(), + ), + ), + (pandas.Series([], dtype="dbtime"), pyarrow.array([], type=pyarrow.time64("ns")),), + ( + pandas.Series([None, None, None], dtype="dbtime"), + pyarrow.array([None, None, None], type=pyarrow.time64("ns")), + ), + ( + pandas.Series( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], dtype="dbtime", + ), + pyarrow.array( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], + type=pyarrow.time64("ns"), + ), + ), + ( + pandas.Series( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_999), + ], + dtype="dbtime", + ), + pyarrow.array( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_999), + ], + type=pyarrow.time64("ns"), + ), + ), +] +SERIES_ARRAYS_CUSTOM_ARROW_TYPES = [ + (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date64())), + ( + pandas.Series([None, None, None], dtype="dbdate"), + pyarrow.array([None, None, None], type=pyarrow.date64()), + ), + ( + pandas.Series( + [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate" + ), + pyarrow.array( + [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], type=pyarrow.date64(), + ), + ), + ( + pandas.Series( + [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], + dtype="dbdate", + ), + pyarrow.array( + [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], + type=pyarrow.date64(), + ), + ), + (pandas.Series([], dtype="dbtime"), pyarrow.array([], type=pyarrow.time32("ms")),), + ( + 
pandas.Series([None, None, None], dtype="dbtime"), + pyarrow.array([None, None, None], type=pyarrow.time32("ms")), + ), + ( + pandas.Series( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)], dtype="dbtime", + ), + pyarrow.array( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)], + type=pyarrow.time32("ms"), + ), + ), + ( + pandas.Series( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], dtype="dbtime", + ), + pyarrow.array( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], + type=pyarrow.time64("us"), + ), + ), + ( + pandas.Series( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_999), + ], + dtype="dbtime", + ), + pyarrow.array( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_999), + ], + type=pyarrow.time64("us"), + ), + ), +] + + +@pytest.mark.parametrize(("series", "expected"), SERIES_ARRAYS_DEFAULT_TYPES) def test_to_arrow(series, expected): array = pyarrow.array(series) assert array.equals(expected) -@pytest.mark.parametrize( - ("series", "expected"), - ( - (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date64())), - ( - pandas.Series([None, None, None], dtype="dbdate"), - pyarrow.array([None, None, None], type=pyarrow.date64()), - ), - ( - pandas.Series( - [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate" - ), - pyarrow.array( - [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], - type=pyarrow.date64(), - ), - ), - ( - pandas.Series( - [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], - dtype="dbdate", - ), - pyarrow.array( - [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], - type=pyarrow.date64(), - ), - ), - ( - pandas.Series([], dtype="dbtime"), - pyarrow.array([], type=pyarrow.time32("ms")), - ), - ( - pandas.Series([None, None, None], dtype="dbtime"), - pyarrow.array([None, None, None], type=pyarrow.time32("ms")), - ), - ( - pandas.Series( - [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)], - dtype="dbtime", - ), - pyarrow.array( - [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)], - type=pyarrow.time32("ms"), - ), - ), - ( - pandas.Series( - [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], - dtype="dbtime", - ), - pyarrow.array( - [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], - type=pyarrow.time64("us"), - ), - ), - ( - pandas.Series( - [ - dt.time(0, 0, 0, 0), - dt.time(12, 30, 15, 125_000), - dt.time(23, 59, 59, 999_999), - ], - dtype="dbtime", - ), - pyarrow.array( - [ - dt.time(0, 0, 0, 0), - dt.time(12, 30, 15, 125_000), - dt.time(23, 59, 59, 999_999), - ], - type=pyarrow.time64("us"), - ), - ), - ), -) +@pytest.mark.parametrize(("series", "expected"), SERIES_ARRAYS_CUSTOM_ARROW_TYPES) def test_to_arrow_w_arrow_type(series, expected): array = pyarrow.array(series, type=expected.type) assert array.equals(expected) + + +@pytest.mark.parametrize( + ["expected", "pyarrow_array"], + SERIES_ARRAYS_DEFAULT_TYPES + SERIES_ARRAYS_CUSTOM_ARROW_TYPES, +) +def test_from_arrow(pyarrow_array: pyarrow.Array, expected: pandas.Series): + # Convert to RecordBatch because types_mapper argument is ignored when + # using a pyarrow.Array. 
https://issues.apache.org/jira/browse/ARROW-9664 + record_batch = pyarrow.RecordBatch.from_arrays([pyarrow_array], ["test_col"]) + dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper) + series = dataframe["test_col"] + pandas.testing.assert_extension_array_equal(series, expected) From 7b95de3056f87cef42a5b9a788eee5e249555a08 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 3 Nov 2021 16:06:28 -0500 Subject: [PATCH 2/9] hack together working implementation TODO: add tests for constructing pandas Series with pyarrow scalars --- db_dtypes/__init__.py | 31 +++++++++++++++++++++++++++++-- db_dtypes/core.py | 5 +---- tests/unit/test_arrow.py | 2 +- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index bce2bf0..219fd9f 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -17,6 +17,7 @@ import datetime import re +from typing import Union import numpy import packaging.version @@ -52,6 +53,13 @@ class TimeDtype(core.BaseDatetimeDtype): def construct_array_type(self): return TimeArray + @staticmethod + def __from_arrow__( + array: Union[pyarrow.Array, pyarrow.ChunkedArray] + ) -> "TimeArray": + # TODO: Consider a more efficient conversion to "M8[ns]" numpy array. + return TimeArray(array) + class TimeArray(core.BaseDatetimeArray): """ @@ -75,7 +83,13 @@ def _datetime( r"(?:\.(?P\d*))?)?)?\s*$" ).match, ): - if isinstance(scalar, datetime.time): + # Convert pyarrow values to datetime.time. + if isinstance(scalar, (pyarrow.Time32Scalar, pyarrow.Time64Scalar)): + scalar = scalar.as_py() + + if scalar is None: + return None + elif isinstance(scalar, datetime.time): return datetime.datetime.combine(cls._epoch, scalar) elif isinstance(scalar, str): # iso string @@ -146,6 +160,13 @@ class DateDtype(core.BaseDatetimeDtype): def construct_array_type(self): return DateArray + @staticmethod + def __from_arrow__( + array: Union[pyarrow.Array, pyarrow.ChunkedArray] + ) -> "DateArray": + # TODO: Consider a more efficient conversion to "M8[ns]" numpy array. + return DateArray(array) + class DateArray(core.BaseDatetimeArray): """ @@ -161,7 +182,13 @@ def _datetime( scalar, match_fn=re.compile(r"\s*(?P\d+)-(?P\d+)-(?P\d+)\s*$").match, ): - if isinstance(scalar, datetime.date): + # Convert pyarrow values to datetime.date. 
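+        # A null pyarrow scalar converts to None via as_py() and is then
+        # handled by the `scalar is None` branch below.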
+ if isinstance(scalar, (pyarrow.Date32Scalar, pyarrow.Date64Scalar)): + scalar = scalar.as_py() + + if scalar is None: + return None + elif isinstance(scalar, datetime.date): return datetime.datetime(scalar.year, scalar.month, scalar.day) elif isinstance(scalar, str): match = match_fn(scalar) diff --git a/db_dtypes/core.py b/db_dtypes/core.py index 7f6da0f..c8f3ad4 100644 --- a/db_dtypes/core.py +++ b/db_dtypes/core.py @@ -61,10 +61,7 @@ def __init__(self, values, dtype=None, copy: bool = False): @classmethod def __ndarray(cls, scalars): - return numpy.array( - [None if scalar is None else cls._datetime(scalar) for scalar in scalars], - "M8[ns]", - ) + return numpy.array([cls._datetime(scalar) for scalar in scalars], "M8[ns]",) @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): diff --git a/tests/unit/test_arrow.py b/tests/unit/test_arrow.py index 82cf60d..4bac5b3 100644 --- a/tests/unit/test_arrow.py +++ b/tests/unit/test_arrow.py @@ -185,4 +185,4 @@ def test_from_arrow(pyarrow_array: pyarrow.Array, expected: pandas.Series): record_batch = pyarrow.RecordBatch.from_arrays([pyarrow_array], ["test_col"]) dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper) series = dataframe["test_col"] - pandas.testing.assert_extension_array_equal(series, expected) + pandas.testing.assert_series_equal(series, expected, check_names=False) From a009fc4cefa798655c5de087bb1e841b5aaa1979 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 4 Nov 2021 10:25:28 -0500 Subject: [PATCH 3/9] fix unit test coverage, optimize arrow to numpy conversion --- db_dtypes/__init__.py | 34 +++++++++++++++++------ tests/unit/test_arrow.py | 60 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 10 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 219fd9f..acaa577 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -30,6 +30,7 @@ import pandas.core.dtypes.generic import pandas.core.nanops import pyarrow +import pyarrow.compute from db_dtypes.version import __version__ from db_dtypes import core @@ -37,6 +38,9 @@ date_dtype_name = "dbdate" time_dtype_name = "dbtime" +_EPOCH = datetime.datetime(1970, 1, 1) +_NPEPOCH = numpy.datetime64(_EPOCH) +_PAEPOCH = pyarrow.scalar(_EPOCH, pyarrow.timestamp("ns")) pandas_release = packaging.version.parse(pandas.__version__).release @@ -57,8 +61,21 @@ def construct_array_type(self): def __from_arrow__( array: Union[pyarrow.Array, pyarrow.ChunkedArray] ) -> "TimeArray": - # TODO: Consider a more efficient conversion to "M8[ns]" numpy array. - return TimeArray(array) + # We can't call combine_chunks on an empty array, so short-circuit the + # rest of the function logic for this special case. + if len(array) == 0: + return TimeArray(numpy.array([], dtype="datetime64[ns]")) + + # We can't cast to timestamp("ns"), but we time64("ns") has the same + # memory layout: 64-bit integers representing the number of nanoseconds + # since the datetime epoch (midnight 1970-01-01). + array = pyarrow.compute.cast(array, pyarrow.time64("ns")) + + # ChunkedArray has no "view" method, so combine into an Array. 
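+        # (pyarrow.compute.cast returns a ChunkedArray when given a
+        # ChunkedArray and a plain Array when given an Array.)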
+ array = array.combine_chunks() if hasattr(array, "combine_chunks") else array + array = array.view(pyarrow.timestamp("ns")) + np_array = array.to_numpy(zero_copy_only=False) + return TimeArray(np_array) class TimeArray(core.BaseDatetimeArray): @@ -69,8 +86,6 @@ class TimeArray(core.BaseDatetimeArray): # Data are stored as datetime64 values with a date of Jan 1, 1970 dtype = TimeDtype() - _epoch = datetime.datetime(1970, 1, 1) - _npepoch = numpy.datetime64(_epoch) @classmethod def _datetime( @@ -90,7 +105,7 @@ def _datetime( if scalar is None: return None elif isinstance(scalar, datetime.time): - return datetime.datetime.combine(cls._epoch, scalar) + return datetime.datetime.combine(_EPOCH, scalar) elif isinstance(scalar, str): # iso string parsed = match_fn(scalar) @@ -127,7 +142,7 @@ def _box_func(self, x): __return_deltas = {"timedelta", "timedelta64", "timedelta64[ns]", " "DateArray": - # TODO: Consider a more efficient conversion to "M8[ns]" numpy array. - return DateArray(array) + array = pyarrow.compute.cast(array, pyarrow.timestamp("ns")) + np_array = array.to_numpy() + return DateArray(np_array) class DateArray(core.BaseDatetimeArray): @@ -233,7 +249,7 @@ def __add__(self, other): return self.astype("object") + other if isinstance(other, TimeArray): - return (other._ndarray - other._npepoch) + self._ndarray + return (other._ndarray - _NPEPOCH) + self._ndarray return super().__add__(other) diff --git a/tests/unit/test_arrow.py b/tests/unit/test_arrow.py index 4bac5b3..66d3183 100644 --- a/tests/unit/test_arrow.py +++ b/tests/unit/test_arrow.py @@ -179,10 +179,68 @@ def test_to_arrow_w_arrow_type(series, expected): ["expected", "pyarrow_array"], SERIES_ARRAYS_DEFAULT_TYPES + SERIES_ARRAYS_CUSTOM_ARROW_TYPES, ) -def test_from_arrow(pyarrow_array: pyarrow.Array, expected: pandas.Series): +def test_series_from_arrow(pyarrow_array: pyarrow.Array, expected: pandas.Series): # Convert to RecordBatch because types_mapper argument is ignored when # using a pyarrow.Array. 
https://issues.apache.org/jira/browse/ARROW-9664 record_batch = pyarrow.RecordBatch.from_arrays([pyarrow_array], ["test_col"]) dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper) series = dataframe["test_col"] pandas.testing.assert_series_equal(series, expected, check_names=False) + + +@pytest.mark.parametrize( + ["expected", "pyarrow_array"], + SERIES_ARRAYS_DEFAULT_TYPES + SERIES_ARRAYS_CUSTOM_ARROW_TYPES, +) +def test_series_from_arrow_scalars( + pyarrow_array: pyarrow.Array, expected: pandas.Series +): + scalars = [] + for scalar in pyarrow_array: + scalars.append(scalar) + assert isinstance(scalar, pyarrow.Scalar) + series = pandas.Series(scalars, dtype=expected.dtype) + pandas.testing.assert_series_equal(series, expected) + + +def test_dataframe_from_arrow(): + record_batch = pyarrow.RecordBatch.from_arrays( + [ + pyarrow.array( + [dt.date(2021, 11, 4), dt.date(2038, 1, 20), None, dt.date(1970, 1, 1)], + type=pyarrow.date32(), + ), + pyarrow.array( + [ + dt.time(10, 7, 8, 995_325), + dt.time(23, 59, 59, 999_999), + None, + dt.time(0, 0, 0, 0), + ], + type=pyarrow.time64("us"), + ), + pyarrow.array([1, 2, 3, 4]), + ], + ["date_col", "time_col", "int_col"], + ) + dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper) + expected = pandas.DataFrame( + { + "date_col": pandas.Series( + [dt.date(2021, 11, 4), dt.date(2038, 1, 20), None, dt.date(1970, 1, 1)], + dtype="dbdate", + ), + "time_col": pandas.Series( + [ + dt.time(10, 7, 8, 995_325), + dt.time(23, 59, 59, 999_999), + None, + dt.time(0, 0, 0, 0), + ], + dtype="dbtime", + ), + "int_col": [1, 2, 3, 4], + }, + columns=["date_col", "time_col", "int_col"], + ) + pandas.testing.assert_frame_equal(dataframe, expected) From b4dd5cd991e0024f57fd93ae01282427c7acba79 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 4 Nov 2021 10:36:51 -0500 Subject: [PATCH 4/9] apply same optimizations to to_arrow conversion --- db_dtypes/__init__.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index acaa577..8b47a18 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -40,7 +40,6 @@ time_dtype_name = "dbtime" _EPOCH = datetime.datetime(1970, 1, 1) _NPEPOCH = numpy.datetime64(_EPOCH) -_PAEPOCH = pyarrow.scalar(_EPOCH, pyarrow.timestamp("ns")) pandas_release = packaging.version.parse(pandas.__version__).release @@ -66,7 +65,7 @@ def __from_arrow__( if len(array) == 0: return TimeArray(numpy.array([], dtype="datetime64[ns]")) - # We can't cast to timestamp("ns"), but we time64("ns") has the same + # We can't cast to timestamp("ns"), but time64("ns") has the same # memory layout: 64-bit integers representing the number of nanoseconds # since the datetime epoch (midnight 1970-01-01). array = pyarrow.compute.cast(array, pyarrow.time64("ns")) @@ -157,9 +156,17 @@ def to_numpy(self, dtype="object"): return self.astype(dtype) def __arrow_array__(self, type=None): - return pyarrow.array( - self.to_numpy(dtype="object"), - type=type if type is not None else pyarrow.time64("ns"), + array = pyarrow.array(self._ndarray, type=pyarrow.timestamp("ns")) + + # ChunkedArray has no "view" method, so combine into an Array. + array = array.combine_chunks() if hasattr(array, "combine_chunks") else array + + # We can't cast to time64("ns"), but timestamp("ns") has the same + # memory layout: 64-bit integers representing the number of nanoseconds + # since the datetime epoch (midnight 1970-01-01). 
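+        # For example, 00:00:01 is stored as the timestamp
+        # 1970-01-01 00:00:01, i.e. the int64 value 1_000_000_000, which is
+        # also the time64("ns") encoding of 00:00:01.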
+ array = array.view(pyarrow.time64("ns")) + return pyarrow.compute.cast( + array, type if type is not None else pyarrow.time64("ns"), ) @@ -240,8 +247,9 @@ def astype(self, dtype, copy=True): return super().astype(dtype, copy=copy) def __arrow_array__(self, type=None): - return pyarrow.array( - self._ndarray, type=type if type is not None else pyarrow.date32(), + array = pyarrow.array(self._ndarray, type=pyarrow.timestamp("ns")) + return pyarrow.compute.cast( + array, type if type is not None else pyarrow.date32(), ) def __add__(self, other): From 9c239a481ff55a38ab8f2b7519a376c1637b5905 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 4 Nov 2021 11:03:09 -0500 Subject: [PATCH 5/9] remove redundant to_numpy now that to_arrow doesn't use it --- db_dtypes/__init__.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 8b47a18..7852319 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -150,11 +150,6 @@ def astype(self, dtype, copy=True): else: return super().astype(dtype, copy=copy) - if pandas_release < (1,): - - def to_numpy(self, dtype="object"): - return self.astype(dtype) - def __arrow_array__(self, type=None): array = pyarrow.array(self._ndarray, type=pyarrow.timestamp("ns")) From fe021fd33a7cb44b1548efc50e06c34288040bbb Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 4 Nov 2021 15:28:02 -0500 Subject: [PATCH 6/9] be explicit about chunked array vs array --- db_dtypes/__init__.py | 8 ++++++-- tests/unit/test_arrow.py | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 7852319..f4394f2 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -71,7 +71,9 @@ def __from_arrow__( array = pyarrow.compute.cast(array, pyarrow.time64("ns")) # ChunkedArray has no "view" method, so combine into an Array. - array = array.combine_chunks() if hasattr(array, "combine_chunks") else array + if isinstance(array, pyarrow.ChunkedArray): + array = array.combine_chunks() + array = array.view(pyarrow.timestamp("ns")) np_array = array.to_numpy(zero_copy_only=False) return TimeArray(np_array) @@ -154,7 +156,9 @@ def __arrow_array__(self, type=None): array = pyarrow.array(self._ndarray, type=pyarrow.timestamp("ns")) # ChunkedArray has no "view" method, so combine into an Array. 
-        array = array.combine_chunks() if hasattr(array, "combine_chunks") else array
- array = array.combine_chunks() if hasattr(array, "combine_chunks") else array + array = ( + array.combine_chunks() if isinstance(array, pyarrow.ChunkedArray) else array + ) # We can't cast to time64("ns"), but timestamp("ns") has the same # memory layout: 64-bit integers representing the number of nanoseconds diff --git a/tests/unit/test_arrow.py b/tests/unit/test_arrow.py index 66d3183..3eb4417 100644 --- a/tests/unit/test_arrow.py +++ b/tests/unit/test_arrow.py @@ -203,6 +203,29 @@ def test_series_from_arrow_scalars( pandas.testing.assert_series_equal(series, expected) +def test_dbtime_series_from_arrow_array(): + """Test to explicitly check Array -> Series conversion.""" + array = pyarrow.array([dt.time(15, 21, 0, 123_456)], type=pyarrow.time64("us")) + assert isinstance(array, pyarrow.Array) + assert not isinstance(array, pyarrow.ChunkedArray) + series = pandas.Series(db_dtypes.TimeDtype.__from_arrow__(array)) + expected = pandas.Series([dt.time(15, 21, 0, 123_456)], dtype="dbtime") + pandas.testing.assert_series_equal(series, expected) + + +def test_dbtime_series_from_arrow_chunkedarray(): + """Test to explicitly check ChunkedArray -> Series conversion.""" + array1 = pyarrow.array([dt.time(15, 21, 0, 123_456)], type=pyarrow.time64("us")) + array2 = pyarrow.array([dt.time(0, 0, 0, 0)], type=pyarrow.time64("us")) + array = pyarrow.chunked_array([array1, array2]) + assert isinstance(array, pyarrow.ChunkedArray) + series = pandas.Series(db_dtypes.TimeDtype.__from_arrow__(array)) + expected = pandas.Series( + [dt.time(15, 21, 0, 123_456), dt.time(0, 0, 0, 0)], dtype="dbtime" + ) + pandas.testing.assert_series_equal(series, expected) + + def test_dataframe_from_arrow(): record_batch = pyarrow.RecordBatch.from_arrays( [ From f042e83f210feee4d850683b74d6061953c14bee Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 8 Nov 2021 16:02:07 -0600 Subject: [PATCH 7/9] add docstrings to arrow conversion functions --- db_dtypes/__init__.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index f4394f2..f954e09 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -60,6 +60,11 @@ def construct_array_type(self): def __from_arrow__( array: Union[pyarrow.Array, pyarrow.ChunkedArray] ) -> "TimeArray": + """Convert to dbtime data from an Arrow array. + + See: + https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow + """ # We can't call combine_chunks on an empty array, so short-circuit the # rest of the function logic for this special case. if len(array) == 0: @@ -153,6 +158,11 @@ def astype(self, dtype, copy=True): return super().astype(dtype, copy=copy) def __arrow_array__(self, type=None): + """Convert to an Arrow array from dbtime data. + + See: + https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow + """ array = pyarrow.array(self._ndarray, type=pyarrow.timestamp("ns")) # ChunkedArray has no "view" method, so combine into an Array. @@ -185,6 +195,11 @@ def construct_array_type(self): def __from_arrow__( array: Union[pyarrow.Array, pyarrow.ChunkedArray] ) -> "DateArray": + """Convert to dbdate data from an Arrow array. 
+ + See: + https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow + """ array = pyarrow.compute.cast(array, pyarrow.timestamp("ns")) np_array = array.to_numpy() return DateArray(np_array) @@ -246,6 +261,11 @@ def astype(self, dtype, copy=True): return super().astype(dtype, copy=copy) def __arrow_array__(self, type=None): + """Convert to an Arrow array from dbdate data. + + See: + https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow + """ array = pyarrow.array(self._ndarray, type=pyarrow.timestamp("ns")) return pyarrow.compute.cast( array, type if type is not None else pyarrow.date32(), From 5480c11d1a9a6e91f3da0056f2c9d1323ede217d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 8 Nov 2021 16:36:07 -0600 Subject: [PATCH 8/9] add test case for round-trip to/from pyarrow nanosecond-precision time scalars --- db_dtypes/__init__.py | 9 ++++++++- tests/unit/test_arrow.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index f954e09..f1424fb 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -106,12 +106,19 @@ def _datetime( ): # Convert pyarrow values to datetime.time. if isinstance(scalar, (pyarrow.Time32Scalar, pyarrow.Time64Scalar)): - scalar = scalar.as_py() + scalar = ( + scalar.cast(pyarrow.time64("ns")) + .cast(pyarrow.int64()) + .cast(pyarrow.timestamp("ns")) + .as_py() + ) if scalar is None: return None elif isinstance(scalar, datetime.time): return datetime.datetime.combine(_EPOCH, scalar) + elif isinstance(scalar, pandas.Timestamp): + return scalar.to_datetime64() elif isinstance(scalar, str): # iso string parsed = match_fn(scalar) diff --git a/tests/unit/test_arrow.py b/tests/unit/test_arrow.py index 3eb4417..2bd7a5a 100644 --- a/tests/unit/test_arrow.py +++ b/tests/unit/test_arrow.py @@ -24,6 +24,11 @@ import db_dtypes +SECOND_NANOS = 1_000_000_000 +MINUTE_NANOS = 60 * SECOND_NANOS +HOUR_NANOS = 60 * MINUTE_NANOS + + def types_mapper( pyarrow_type: pyarrow.DataType, ) -> Optional[pandas.api.extensions.ExtensionDtype]: @@ -160,6 +165,40 @@ def types_mapper( type=pyarrow.time64("us"), ), ), + ( + pandas.Series( + [ + # Only microseconds are supported when reading data. See: + # https://github.com/googleapis/python-db-dtypes-pandas/issues/19 + # Still, round-trip with pyarrow nanosecond precision scalars + # is supported. 
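+                # (TimeArray stores datetime64[ns] values internally, so
+                # nanosecond precision survives the pandas round-trip.)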
+ pyarrow.scalar(0, pyarrow.time64("ns")), + pyarrow.scalar( + 12 * HOUR_NANOS + + 30 * MINUTE_NANOS + + 15 * SECOND_NANOS + + 123_456_789, + pyarrow.time64("ns"), + ), + pyarrow.scalar( + 23 * HOUR_NANOS + + 59 * MINUTE_NANOS + + 59 * SECOND_NANOS + + 999_999_999, + pyarrow.time64("ns"), + ), + ], + dtype="dbtime", + ), + pyarrow.array( + [ + 0, + 12 * HOUR_NANOS + 30 * MINUTE_NANOS + 15 * SECOND_NANOS + 123_456_789, + 23 * HOUR_NANOS + 59 * MINUTE_NANOS + 59 * SECOND_NANOS + 999_999_999, + ], + type=pyarrow.time64("ns"), + ), + ), ] From a6b739623d93500aafb7ee2e1181b5779c1fa662 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 8 Nov 2021 16:40:25 -0600 Subject: [PATCH 9/9] add time32("ms") test case without nulls for completeness --- tests/unit/test_arrow.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/unit/test_arrow.py b/tests/unit/test_arrow.py index 2bd7a5a..5f45a90 100644 --- a/tests/unit/test_arrow.py +++ b/tests/unit/test_arrow.py @@ -138,6 +138,24 @@ def types_mapper( type=pyarrow.time32("ms"), ), ), + ( + pandas.Series( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_000), + ], + dtype="dbtime", + ), + pyarrow.array( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_000), + ], + type=pyarrow.time32("ms"), + ), + ), ( pandas.Series( [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], dtype="dbtime",