From 8399215d467a92a7862feaf9a0808ea8aa433aa3 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 3 Dec 2021 15:48:11 -0600 Subject: [PATCH 1/3] fix: raise ValueError if date is out-of-bounds --- db_dtypes/__init__.py | 4 ++-- tests/unit/test_arrow.py | 3 ++- tests/unit/test_date.py | 5 +++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index f1424fb..1741791 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -233,7 +233,7 @@ def _datetime( if scalar is None: return None elif isinstance(scalar, datetime.date): - return datetime.datetime(scalar.year, scalar.month, scalar.day) + return pandas.Timestamp(scalar.year, scalar.month, scalar.day) elif isinstance(scalar, str): match = match_fn(scalar) if not match: @@ -241,7 +241,7 @@ def _datetime( year = int(match.group("year")) month = int(match.group("month")) day = int(match.group("day")) - return datetime.datetime(year, month, day) + return pandas.Timestamp(year, month, day) else: raise TypeError("Invalid value type", scalar) diff --git a/tests/unit/test_arrow.py b/tests/unit/test_arrow.py index 5f45a90..24e0252 100644 --- a/tests/unit/test_arrow.py +++ b/tests/unit/test_arrow.py @@ -183,7 +183,7 @@ def types_mapper( type=pyarrow.time64("us"), ), ), - ( + pytest.param( pandas.Series( [ # Only microseconds are supported when reading data. See: @@ -216,6 +216,7 @@ def types_mapper( ], type=pyarrow.time64("ns"), ), + id="time-nanoseconds-arrow-round-trip", ), ] diff --git a/tests/unit/test_date.py b/tests/unit/test_date.py index c919f6d..b906f24 100644 --- a/tests/unit/test_date.py +++ b/tests/unit/test_date.py @@ -55,6 +55,11 @@ def test_date_parsing(value, expected): ("2021-2-99", "day is out of range for month"), ("2021-99-1", "month must be in 1[.][.]12"), ("10000-1-1", "year 10000 is out of range"), + # Outside of min/max values pandas.Timestamp. 
+ ("0001-01-01", "Out of bounds"), + ("9999-12-31", "Out of bounds"), + ("1677-09-21", "Out of bounds"), + ("2262-04-12", "Out of bounds"), ], ) def test_date_parsing_errors(value, error): From f95e3962a432a11a41d88b89c2676bdf172617d5 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 3 Dec 2021 17:20:59 -0600 Subject: [PATCH 2/3] unify _datetime return type --- db_dtypes/__init__.py | 50 ++++++++++++++++++++++++++----------------- db_dtypes/core.py | 4 +--- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 1741791..056be28 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -17,7 +17,7 @@ import datetime import re -from typing import Union +from typing import Optional, Union import numpy import packaging.version @@ -103,7 +103,7 @@ def _datetime( r"(?::(?P<seconds>\d+)" r"(?:\.(?P<fraction>\d*))?)?\s*$" ).match, - ): + ) -> Optional[numpy.datetime64]: # Convert pyarrow values to datetime.time. if isinstance(scalar, (pyarrow.Time32Scalar, pyarrow.Time64Scalar)): scalar = ( @@ -115,8 +115,16 @@ def _datetime( if scalar is None: return None - elif isinstance(scalar, datetime.time): - return datetime.datetime.combine(_EPOCH, scalar) + if isinstance(scalar, datetime.time): + return pandas.Timestamp( + year=1970, + month=1, + day=1, + hour=scalar.hour, + minute=scalar.minute, + second=scalar.second, + microsecond=scalar.microsecond, + ).to_datetime64() elif isinstance(scalar, pandas.Timestamp): return scalar.to_datetime64() elif isinstance(scalar, str): @@ -125,20 +133,20 @@ def _datetime( if not parsed: raise ValueError(f"Bad time string: {repr(scalar)}") - hours = parsed.group("hours") - minutes = parsed.group("minutes") - seconds = parsed.group("seconds") + hour = parsed.group("hours") + minute = parsed.group("minutes") + second = parsed.group("seconds") fraction = parsed.group("fraction") - microseconds = int(fraction.ljust(6, "0")[:6]) if fraction else 0 - return datetime.datetime( - 1970, - 1, - 1, 
- int(hours), - int(minutes) if minutes else 0, - int(seconds) if seconds else 0, - microseconds, - ) + nanosecond = int(fraction.ljust(9, "0")[:9]) if fraction else 0 + return pandas.Timestamp( + year=1970, + month=1, + day=1, + hour=int(hour), + minute=int(minute) if minute else 0, + second=int(second) if second else 0, + nanosecond=nanosecond, + ).to_datetime64() else: raise TypeError("Invalid value type", scalar) @@ -225,7 +233,7 @@ class DateArray(core.BaseDatetimeArray): def _datetime( scalar, match_fn=re.compile(r"\s*(?P<year>\d+)-(?P<month>\d+)-(?P<day>\d+)\s*$").match, - ): + ) -> Optional[numpy.datetime64]: # Convert pyarrow values to datetime.date. if isinstance(scalar, (pyarrow.Date32Scalar, pyarrow.Date64Scalar)): scalar = scalar.as_py() @@ -233,7 +241,9 @@ def _datetime( if scalar is None: return None elif isinstance(scalar, datetime.date): - return pandas.Timestamp(scalar.year, scalar.month, scalar.day) + return pandas.Timestamp( + year=scalar.year, month=scalar.month, day=scalar.day + ).to_datetime64() elif isinstance(scalar, str): match = match_fn(scalar) if not match: @@ -241,7 +251,7 @@ def _datetime( year = int(match.group("year")) month = int(match.group("month")) day = int(match.group("day")) - return pandas.Timestamp(year=year, month=month, day=day).to_datetime64() + return pandas.Timestamp(year=year, month=month, day=day).to_datetime64() else: raise TypeError("Invalid value type", scalar) diff --git a/db_dtypes/core.py b/db_dtypes/core.py index c8f3ad4..3ade198 100644 --- a/db_dtypes/core.py +++ b/db_dtypes/core.py @@ -127,9 +127,7 @@ def take( if allow_fill: fill_value = self._validate_scalar(fill_value) fill_value = ( - numpy.datetime64() - if fill_value is None - else numpy.datetime64(self._datetime(fill_value)) + numpy.datetime64() if fill_value is None else self._datetime(fill_value) ) if (indices < -1).any(): raise ValueError( From 2ea4e51d6923281d8af49efe9746e4b3afc0da96 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 3 Dec 2021 17:25:59 -0600 Subject: [PATCH 3/3] add relevant 
unit test --- tests/unit/test_arrow.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_arrow.py b/tests/unit/test_arrow.py index 24e0252..4d4fc50 100644 --- a/tests/unit/test_arrow.py +++ b/tests/unit/test_arrow.py @@ -183,13 +183,13 @@ def types_mapper( type=pyarrow.time64("us"), ), ), + # Only microseconds are supported when reading data. See: + # https://github.com/googleapis/python-db-dtypes-pandas/issues/19 + # Still, round-trip with pyarrow nanosecond precision scalars + # is supported. pytest.param( pandas.Series( [ - # Only microseconds are supported when reading data. See: - # https://github.com/googleapis/python-db-dtypes-pandas/issues/19 - # Still, round-trip with pyarrow nanosecond precision scalars - # is supported. pyarrow.scalar(0, pyarrow.time64("ns")), pyarrow.scalar( 12 * HOUR_NANOS @@ -218,6 +218,20 @@ def types_mapper( ), id="time-nanoseconds-arrow-round-trip", ), + pytest.param( + pandas.Series( + ["0:0:0", "12:30:15.123456789", "23:59:59.999999999"], dtype="dbtime", + ), + pyarrow.array( + [ + 0, + 12 * HOUR_NANOS + 30 * MINUTE_NANOS + 15 * SECOND_NANOS + 123_456_789, + 23 * HOUR_NANOS + 59 * MINUTE_NANOS + 59 * SECOND_NANOS + 999_999_999, + ], + type=pyarrow.time64("ns"), + ), + id="time-nanoseconds-arrow-from-string", + ), ]