From e793ec6c7ed56074d8fa12e2ff27905ce8c31191 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Sep 2021 11:00:00 -0500 Subject: [PATCH 1/4] fix: support converting empty `time` Series to pyarrow Array --- tests/unit/test_arrow.py | 70 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 tests/unit/test_arrow.py diff --git a/tests/unit/test_arrow.py b/tests/unit/test_arrow.py new file mode 100644 index 0000000..363af17 --- /dev/null +++ b/tests/unit/test_arrow.py @@ -0,0 +1,70 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime as dt + +import pandas +import pyarrow +import pytest + +# To register the types. +import db_dtypes # noqa + + +@pytest.mark.parametrize( + ("series", "expected"), + ( + (pandas.Series([], dtype="date"), pyarrow.array([], type=pyarrow.date32())), + ( + pandas.Series([None, None, None], dtype="date"), + pyarrow.array([None, None, None], type=pyarrow.date32()), + ), + ( + pandas.Series( + [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="date" + ), + pyarrow.array( + [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], + type=pyarrow.date32(), + ), + ), + ( + pandas.Series( + [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], + dtype="date", + ), + pyarrow.array( + [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], + type=pyarrow.date32(), + ), + ), + (pandas.Series([], dtype="time"), pyarrow.array([], type=pyarrow.time64("ns"))), + ( + pandas.Series([None, None, None], dtype="time"), + pyarrow.array([None, None, None], type=pyarrow.time64("ns")), + ), + ( + pandas.Series( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], dtype="time" + ), + pyarrow.array( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], + type=pyarrow.time64("ns"), + ), + ), + ), +) +def test_to_arrow(series, expected): + array = pyarrow.array(series) + assert array.equals(expected) From e08f7fc5e78675b59b53f41865eb149f121d79d9 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Sep 2021 15:36:10 -0500 Subject: [PATCH 2/4] use object dtype for time numpy array --- db_dtypes/__init__.py | 3 +- tests/unit/test_arrow.py | 93 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 8a58666..7687992 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -98,7 +98,8 @@ def astype(self, dtype, copy=True): def __arrow_array__(self, type=None): return pyarrow.array( - self.to_numpy(), type=type if type is not None else pyarrow.time64("ns"), + self.to_numpy(dtype="object"), + type=type if type is not None else pyarrow.time64("ns"), ) diff --git a/tests/unit/test_arrow.py b/tests/unit/test_arrow.py index 363af17..dd0aed7 100644 --- a/tests/unit/test_arrow.py +++ b/tests/unit/test_arrow.py @@ -63,8 +63,101 @@ type=pyarrow.time64("ns"), ), ), + ( + pandas.Series( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_999), + ], + dtype="time", + ), + pyarrow.array( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_999), + ], + type=pyarrow.time64("ns"), + ), + ), ), ) def test_to_arrow(series, expected): array = pyarrow.array(series) assert array.equals(expected) + + +@pytest.mark.parametrize( + ("series", "expected"), + ( + (pandas.Series([], dtype="date"), pyarrow.array([], type=pyarrow.date64())), + ( + pandas.Series([None, None, None], dtype="date"), + pyarrow.array([None, None, None], type=pyarrow.date64()), + ), + ( + pandas.Series( + [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="date" + ), + pyarrow.array( + [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], + type=pyarrow.date64(), + ), + ), + ( + pandas.Series( + [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], + dtype="date", + ), + pyarrow.array( + [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], + type=pyarrow.date64(), + ), + ), + (pandas.Series([], dtype="time"), pyarrow.array([], type=pyarrow.time32("ms"))), + ( + pandas.Series([None, None, None], dtype="time"), + pyarrow.array([None, None, None], type=pyarrow.time32("ms")), + ), + ( + pandas.Series( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)], dtype="time" + ), + pyarrow.array( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)], + type=pyarrow.time32("ms"), + ), + ), + ( + pandas.Series( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], dtype="time" + ), + pyarrow.array( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], + type=pyarrow.time64("us"), + ), + ), + ( + pandas.Series( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_999), + ], + dtype="time", + ), + pyarrow.array( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_999), + ], + type=pyarrow.time64("us"), + ), + ), + ), +) +def test_to_arrow_w_arrow_type(series, expected): + array = pyarrow.array(series, type=expected.type) + assert array.equals(expected) From 281afb9653bfebfcabe14e7841c397e4deaa1c3d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 28 Sep 2021 14:45:52 -0500 Subject: [PATCH 3/4] backport to_numpy --- db_dtypes/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 7687992..2b18587 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -18,6 +18,7 @@ import datetime import numpy +import packaging.version import pandas import pandas.compat.numpy.function import pandas.core.algorithms @@ -35,6 +36,8 @@ date_dtype_name = "date" time_dtype_name = "time" +pandas_release = packaging.version.parse(pandas.__version__).release + @pandas.core.dtypes.dtypes.register_extension_dtype class TimeDtype(core.BaseDatetimeDtype): @@ -96,6 +99,11 @@ def astype(self, dtype, copy=True): else: return super().astype(dtype, copy=copy) + if pandas_release < (1,): + + def to_numpy(self, dtype="object"): + return self.astype(dtype) + def __arrow_array__(self, type=None): return pyarrow.array( self.to_numpy(dtype="object"), From 63b946c8c04ad57f9ec298b85961c9206433c607 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 28 Sep 2021 17:05:05 -0500 Subject: [PATCH 4/4] remove redundant test --- tests/unit/test_dtypes.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index eca3a31..118458e 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -15,7 +15,6 @@ import datetime import packaging.version -import pyarrow.lib import pytest pd = pytest.importorskip("pandas") @@ -670,13 +669,3 @@ def test_bad_time_parsing(value, error): def test_bad_date_parsing(value, error): with pytest.raises(ValueError, match=error): _cls("date")([value]) - - -@for_date_and_time -def test_date___arrow__array__(dtype): - a = _make_one(dtype) - ar = a.__arrow_array__() - assert isinstance( - ar, pyarrow.Date32Array if dtype == "date" else pyarrow.Time64Array, - ) - assert [v.as_py() for v in ar] == list(a)