diff --git a/docs/reading.rst b/docs/reading.rst index 5fa369a7..d48504f8 100644 --- a/docs/reading.rst +++ b/docs/reading.rst @@ -56,17 +56,17 @@ Inferring the DataFrame's dtypes The :func:`~pandas_gbq.read_gbq` method infers the pandas dtype for each column, based on the BigQuery table schema. -================== ========================= +================== ============================================ BigQuery Data Type dtype -================== ========================= +================== ============================================ BOOL boolean INT64 Int64 FLOAT64 float64 TIME dbtime DATE dbdate or object -DATETIME datetime64[ns] or object -TIMESTAMP datetime64[ns, UTC] or object -================== ========================= +DATETIME datetime64[ns] (datetime64[us] if pandas version >= 2.1.0) or object +TIMESTAMP datetime64[ns, UTC] (datetime64[us, UTC] if pandas version >= 2.1.0) or object +================== ============================================ If any DATE/DATETIME/TIMESTAMP value is outside of the range of `pandas.Timestamp.min `__ diff --git a/pandas_gbq/features.py b/pandas_gbq/features.py index 62405b4c..b9358f7b 100644 --- a/pandas_gbq/features.py +++ b/pandas_gbq/features.py @@ -9,6 +9,7 @@ BIGQUERY_QUERY_AND_WAIT_VERSION = "3.14.0" PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0" PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0" +PANDAS_MICROSECONDS_DATETIME_VERSION = "2.1.0" class Features: @@ -81,5 +82,12 @@ def pandas_has_boolean_dtype(self): desired_version = packaging.version.parse(PANDAS_BOOLEAN_DTYPE_VERSION) return self.pandas_installed_version >= desired_version + @property + def pandas_has_microseconds_datetime(self): + import packaging.version + + desired_version = packaging.version.parse(PANDAS_MICROSECONDS_DATETIME_VERSION) + return self.pandas_installed_version >= desired_version + FEATURES = Features() diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index feffd858..f4373252 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ 
-630,6 +630,7 @@ def _finalize_dtypes( """ import db_dtypes import pandas.api.types + import pandas # If you update this mapping, also update the table at # `docs/reading.rst`. @@ -638,6 +639,14 @@ def _finalize_dtypes( "DATETIME": "datetime64[ns]", "TIMESTAMP": "datetime64[ns]", } + if FEATURES.pandas_has_microseconds_datetime: + # when pandas is 2.1.0 or later, default timestamp dtype is 'datetime64[us]' + # and we should use 'datetime64[us]' instead of 'datetime64[ns]' + dtype_map = { + "DATE": db_dtypes.DateDtype(), + "DATETIME": "datetime64[us]", + "TIMESTAMP": pandas.DatetimeTZDtype(unit="us", tz="UTC"), + } for field in schema_fields: # This method doesn't modify ARRAY/REPEATED columns. diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index e69de29b..a3deb24e 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -0,0 +1,2 @@ +numpy==1.26.4 +pandas==2.0.3 diff --git a/testing/constraints-3.11.txt b/testing/constraints-3.11.txt index e69de29b..10185663 100644 --- a/testing/constraints-3.11.txt +++ b/testing/constraints-3.11.txt @@ -0,0 +1 @@ +pandas==2.1.4 diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 76864a66..47012bef 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -1,2 +1,2 @@ -numpy==1.19.4 -pandas==1.1.4 +numpy==1.20.3 +pandas==1.5.3 diff --git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py index 4ae96a36..06692c2d 100644 --- a/tests/system/test_read_gbq.py +++ b/tests/system/test_read_gbq.py @@ -16,6 +16,7 @@ from pandas_gbq.features import FEATURES + QueryTestCase = collections.namedtuple( "QueryTestCase", ["query", "expected", "use_bqstorage_apis"], @@ -628,7 +629,9 @@ def test_empty_dataframe(read_gbq, use_bqstorage_api): ), "datetime_col": pandas.Series( [], - dtype="datetime64[ns]", + dtype="datetime64[us]" + if FEATURES.pandas_has_microseconds_datetime + else "datetime64[ns]", ), "float_col": pandas.Series([], 
dtype="float64"), "int64_col": pandas.Series([], dtype="Int64"), @@ -640,8 +643,10 @@ def test_empty_dataframe(read_gbq, use_bqstorage_api): ), "timestamp_col": pandas.Series( [], - dtype="datetime64[ns]", - ).dt.tz_localize(datetime.timezone.utc), + dtype=pandas.DatetimeTZDtype(unit="us", tz="UTC") + if FEATURES.pandas_has_microseconds_datetime + else pandas.DatetimeTZDtype(tz="UTC"), + ), } ) result = read_gbq(query, use_bqstorage_api=use_bqstorage_api) diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index 139f072b..17d8bb13 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -16,6 +16,9 @@ pytest.importorskip("google.cloud.bigquery", minversion="1.24.0") +PANDAS_VERSION = tuple(int(part) for part in pandas.__version__.split(".")[:2]) + + @pytest.fixture(params=["load_parquet", "load_csv"]) def api_method(request): return request.param @@ -343,25 +346,33 @@ def test_series_round_trip( # require `date_as_object` parameter in # google-cloud-bigquery versions 1.x and 2.x, but not 3.x. 
# https://github.com/googleapis/python-bigquery-pandas/issues/365 - "datetime_col": [ - datetime.datetime(1, 1, 1), - datetime.datetime(1970, 1, 1), - datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), - ], - "timestamp_col": [ - datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc), - datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc), - datetime.datetime( - 9999, - 12, - 31, - 23, - 59, - 59, - 999999, - tzinfo=datetime.timezone.utc, - ), - ], + "datetime_col": pandas.Series( + [ + datetime.datetime(1, 1, 1), + datetime.datetime(1970, 1, 1), + datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + ], + dtype="object" if PANDAS_VERSION < (2, 1) else "datetime64[us]", + ), + "timestamp_col": pandas.Series( + [ + datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc), + datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc), + datetime.datetime( + 9999, + 12, + 31, + 23, + 59, + 59, + 999999, + tzinfo=datetime.timezone.utc, + ), + ], + dtype="object" + if PANDAS_VERSION < (2, 1) + else pandas.DatetimeTZDtype(unit="us", tz="UTC"), + ), }, columns=["row_num", "date_col", "datetime_col", "timestamp_col"], ), diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 75574820..c49f2157 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -113,6 +113,62 @@ def test__bqschema_to_nullsafe_dtypes(type_, expected): assert result == {"x": expected} +@pytest.mark.parametrize( + ("data", "schema_type", "expected"), + [ + pytest.param( + pandas.to_datetime(["2017-01-01T12:00:00Z"]).astype( + pandas.DatetimeTZDtype( + unit="us" if FEATURES.pandas_has_microseconds_datetime else "ns", + tz="UTC", + ), + ), + "TIMESTAMP", + pandas.DatetimeTZDtype( + unit="us" if FEATURES.pandas_has_microseconds_datetime else "ns", + tz="UTC", + ), + ), + ( + pandas.to_datetime([]).astype(object), + "TIMESTAMP", + pandas.DatetimeTZDtype( + unit="us" if FEATURES.pandas_has_microseconds_datetime else "ns", + tz="UTC", + ), + ), + ( + 
pandas.to_datetime(["2017-01-01T12:00:00"]).astype( + "datetime64[us]" + if FEATURES.pandas_has_microseconds_datetime + else "datetime64[ns]", + ), + "DATETIME", + numpy.dtype( + "datetime64[us]" + if FEATURES.pandas_has_microseconds_datetime + else "datetime64[ns]", + ), + ), + ( + pandas.to_datetime([]).astype(object), + "DATETIME", + numpy.dtype( + "datetime64[us]" + if FEATURES.pandas_has_microseconds_datetime + else "datetime64[ns]", + ), + ), + ], +) +def test__finalize_dtypes(data, schema_type, expected): + result = gbq._finalize_dtypes( + pandas.DataFrame(dict(x=data)), + [dict(name="x", type=schema_type, mode="NULLABLE")], + ) + assert result["x"].dtype == expected + + @pytest.mark.parametrize( ["query_or_table", "expected"], [