Skip to content

Commit 141b2b4

Browse files
authored
ENH: Use tz-aware dtype for timestamp columns (#269)
ENH: Use tz-aware dtype for timestamp columns in all supported pandas versions. Adds a table documenting this behavior to the "reading" how-to guides.
1 parent 0e1ebf5 commit 141b2b4

File tree

5 files changed

+132
-47
lines changed

5 files changed

+132
-47
lines changed

docs/source/changelog.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@ Changelog
1212
version. This is required to use new functionality such as the BigQuery
1313
Storage API. (:issue:`267`)
1414

15+
Documentation
16+
~~~~~~~~~~~~~
17+
18+
- Document :ref:`BigQuery data type to pandas dtype conversion
19+
<reading-dtypes>` for ``read_gbq``. (:issue:`269`)
20+
1521
Dependency updates
1622
~~~~~~~~~~~~~~~~~~
1723

@@ -27,11 +33,14 @@ Internal changes
2733

2834
Enhancements
2935
~~~~~~~~~~~~
36+
3037
- Allow ``table_schema`` in :func:`to_gbq` to contain only a subset of columns,
3138
with the rest being populated using the DataFrame dtypes (:issue:`218`)
3239
(contributed by @johnpaton)
3340
- Read ``project_id`` in :func:`to_gbq` from provided ``credentials`` if
3441
available (contributed by @daureg)
42+
- ``read_gbq`` uses the timezone-aware ``DatetimeTZDtype(unit='ns',
43+
tz='UTC')`` dtype for BigQuery ``TIMESTAMP`` columns. (:issue:`269`)
3544

3645
.. _changelog-0.9.0:
3746

docs/source/reading.rst

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,32 @@ Suppose you want to load all data from an existing BigQuery table
99

1010
.. code-block:: python
1111
12-
# Insert your BigQuery Project ID Here
13-
# Can be found in the Google web console
12+
import pandas_gbq
13+
14+
# TODO: Set your BigQuery Project ID.
1415
projectid = "xxxxxxxx"
1516
16-
data_frame = read_gbq('SELECT * FROM test_dataset.test_table', projectid)
17+
data_frame = pandas_gbq.read_gbq(
18+
'SELECT * FROM `test_dataset.test_table`',
19+
project_id=projectid)
20+
21+
.. note::
1722

23+
A project ID is sometimes optional if it can be inferred during
24+
authentication, but it is required when authenticating with user
25+
credentials. You can find your project ID in the `Google Cloud console
26+
<https://console.cloud.google.com>`__.
1827

1928
You can define which column from BigQuery to use as an index in the
2029
destination DataFrame as well as a preferred column order as follows:
2130

2231
.. code-block:: python
2332
24-
data_frame = read_gbq('SELECT * FROM test_dataset.test_table',
25-
index_col='index_column_name',
26-
col_order=['col1', 'col2', 'col3'], projectid)
33+
data_frame = pandas_gbq.read_gbq(
34+
'SELECT * FROM `test_dataset.test_table`',
35+
project_id=projectid,
36+
index_col='index_column_name',
37+
col_order=['col1', 'col2', 'col3'])
2738
2839
2940
You can specify the query config as parameter to use additional options of
@@ -37,20 +48,39 @@ your job. For more information about query configuration parameters see `here
3748
"useQueryCache": False
3849
}
3950
}
40-
data_frame = read_gbq('SELECT * FROM test_dataset.test_table',
41-
configuration=configuration, projectid)
51+
data_frame = read_gbq(
52+
'SELECT * FROM `test_dataset.test_table`',
53+
project_id=projectid,
54+
configuration=configuration)
4255
4356
44-
.. note::
57+
The ``dialect`` argument can be used to indicate whether to use
58+
BigQuery's ``'legacy'`` SQL or BigQuery's ``'standard'`` SQL (beta). The
59+
default value is ``'standard'``. For more information on BigQuery's standard
60+
SQL, see `BigQuery SQL Reference
61+
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__
4562

46-
You can find your project id in the `Google developers console
47-
<https://console.developers.google.com>`__.
63+
.. code-block:: python
4864
65+
data_frame = pandas_gbq.read_gbq(
66+
'SELECT * FROM [test_dataset.test_table]',
67+
project_id=projectid,
68+
dialect='legacy')
4969
50-
.. note::
5170
52-
The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL
53-
or BigQuery's ``'standard'`` SQL (beta). The default value is ``'legacy'``, though this will change
54-
in a subsequent release to ``'standard'``. For more information
55-
on BigQuery's standard SQL, see `BigQuery SQL Reference
56-
<https://cloud.google.com/bigquery/sql-reference/>`__
71+
.. _reading-dtypes:
72+
73+
Inferring the DataFrame's dtypes
74+
--------------------------------
75+
76+
The :func:`~pandas_gbq.read_gbq` method infers the pandas dtype for each column, based on the BigQuery table schema.
77+
78+
================== =====================================================================
BigQuery Data Type dtype
================== =====================================================================
FLOAT              float
TIMESTAMP          :class:`~pandas.DatetimeTZDtype` with ``unit='ns'`` and ``tz='UTC'``
DATETIME           datetime64[ns]
TIME               datetime64[ns]
DATE               datetime64[ns]
================== =====================================================================

pandas_gbq/gbq.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,9 @@ def run_query(self, query, **kwargs):
494494
if df.empty:
495495
df = _cast_empty_df_dtypes(schema_fields, df)
496496

497+
# Ensure any TIMESTAMP columns are tz-aware.
498+
df = _localize_df(schema_fields, df)
499+
497500
logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
498501
return df
499502

@@ -644,17 +647,21 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
644647

645648

646649
def _bqschema_to_nullsafe_dtypes(schema_fields):
647-
# Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
648-
# default dtype choice.
649-
#
650-
# See:
651-
# http://pandas.pydata.org/pandas-docs/dev/missing_data.html
652-
# #missing-data-casting-rules-and-indexing
650+
"""Specify explicit dtypes based on BigQuery schema.
651+
652+
This function only specifies a dtype when the dtype allows nulls.
653+
Otherwise, use pandas's default dtype choice.
654+
655+
See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html
656+
#missing-data-casting-rules-and-indexing
657+
"""
658+
# If you update this mapping, also update the table at
659+
# `docs/source/reading.rst`.
653660
dtype_map = {
654661
"FLOAT": np.dtype(float),
655-
# Even though TIMESTAMPs are timezone-aware in BigQuery, pandas doesn't
656-
# support datetime64[ns, UTC] as dtype in DataFrame constructors. See:
657-
# https://github.com/pandas-dev/pandas/issues/12513
662+
# pandas doesn't support timezone-aware dtype in DataFrame/Series
663+
# constructors. It's more idiomatic to localize after construction.
664+
# https://github.com/pandas-dev/pandas/issues/25843
658665
"TIMESTAMP": "datetime64[ns]",
659666
"TIME": "datetime64[ns]",
660667
"DATE": "datetime64[ns]",
@@ -702,6 +709,24 @@ def _cast_empty_df_dtypes(schema_fields, df):
702709
return df
703710

704711

712+
def _localize_df(schema_fields, df):
713+
"""Localize any TIMESTAMP columns to tz-aware type.
714+
715+
In pandas versions before 0.24.0, DatetimeTZDtype cannot be used as the
716+
dtype in Series/DataFrame construction, so localize those columns after
717+
the DataFrame is constructed.
718+
"""
719+
for field in schema_fields:
720+
column = str(field["name"])
721+
if field["mode"].upper() == "REPEATED":
722+
continue
723+
724+
if field["type"].upper() == "TIMESTAMP" and df[column].dt.tz is None:
725+
df[column] = df[column].dt.tz_localize("UTC")
726+
727+
return df
728+
729+
705730
def read_gbq(
706731
query,
707732
project_id=None,

tests/system/test_gbq.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -310,13 +310,15 @@ def test_should_properly_handle_timestamp_unix_epoch(self, project_id):
310310
credentials=self.credentials,
311311
dialect="legacy",
312312
)
313-
tm.assert_frame_equal(
314-
df,
315-
DataFrame(
316-
{"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
317-
dtype="datetime64[ns]",
318-
),
313+
expected = DataFrame(
314+
{"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
315+
dtype="datetime64[ns]",
319316
)
317+
if expected["unix_epoch"].dt.tz is None:
318+
expected["unix_epoch"] = expected["unix_epoch"].dt.tz_localize(
319+
"UTC"
320+
)
321+
tm.assert_frame_equal(df, expected)
320322

321323
def test_should_properly_handle_arbitrary_timestamp(self, project_id):
322324
query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp'
@@ -326,13 +328,15 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
326328
credentials=self.credentials,
327329
dialect="legacy",
328330
)
329-
tm.assert_frame_equal(
330-
df,
331-
DataFrame(
332-
{"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]},
333-
dtype="datetime64[ns]",
334-
),
331+
expected = DataFrame(
332+
{"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]},
333+
dtype="datetime64[ns]",
335334
)
335+
if expected["valid_timestamp"].dt.tz is None:
336+
expected["valid_timestamp"] = expected[
337+
"valid_timestamp"
338+
].dt.tz_localize("UTC")
339+
tm.assert_frame_equal(df, expected)
336340

337341
def test_should_properly_handle_datetime_unix_epoch(self, project_id):
338342
query = 'SELECT DATETIME("1970-01-01 00:00:00") AS unix_epoch'
@@ -368,7 +372,7 @@ def test_should_properly_handle_arbitrary_datetime(self, project_id):
368372
"expression, is_expected_dtype",
369373
[
370374
("current_date()", pandas.api.types.is_datetime64_ns_dtype),
371-
("current_timestamp()", pandas.api.types.is_datetime64_ns_dtype),
375+
("current_timestamp()", pandas.api.types.is_datetime64tz_dtype),
372376
("current_datetime()", pandas.api.types.is_datetime64_ns_dtype),
373377
("TRUE", pandas.api.types.is_bool_dtype),
374378
("FALSE", pandas.api.types.is_bool_dtype),
@@ -402,9 +406,11 @@ def test_should_properly_handle_null_timestamp(self, project_id):
402406
credentials=self.credentials,
403407
dialect="legacy",
404408
)
405-
tm.assert_frame_equal(
406-
df, DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]")
409+
expected = DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]")
410+
expected["null_timestamp"] = expected["null_timestamp"].dt.tz_localize(
411+
"UTC"
407412
)
413+
tm.assert_frame_equal(df, expected)
408414

409415
def test_should_properly_handle_null_datetime(self, project_id):
410416
query = "SELECT CAST(NULL AS DATETIME) AS null_datetime"
@@ -594,6 +600,7 @@ def test_zero_rows(self, project_id):
594600
expected_result = DataFrame(
595601
empty_columns, columns=["title", "id", "is_bot", "ts"]
596602
)
603+
expected_result["ts"] = expected_result["ts"].dt.tz_localize("UTC")
597604
tm.assert_frame_equal(df, expected_result, check_index_type=False)
598605

599606
def test_one_row_one_column(self, project_id):

tests/unit/test_gbq.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,26 @@
11
# -*- coding: utf-8 -*-
22

3-
import pandas.util.testing as tm
4-
import pytest
3+
try:
4+
import mock
5+
except ImportError: # pragma: NO COVER
6+
from unittest import mock
7+
58
import numpy
69
from pandas import DataFrame
10+
import pandas.util.testing as tm
11+
import pkg_resources
12+
import pytest
713

814
import pandas_gbq.exceptions
915
from pandas_gbq import gbq
1016

11-
try:
12-
import mock
13-
except ImportError: # pragma: NO COVER
14-
from unittest import mock
1517

1618
pytestmark = pytest.mark.filter_warnings(
1719
"ignore:credentials from Google Cloud SDK"
1820
)
21+
pandas_installed_version = pkg_resources.get_distribution(
22+
"pandas"
23+
).parsed_version
1924

2025

2126
@pytest.fixture
@@ -90,6 +95,7 @@ def no_auth(monkeypatch):
9095
("INTEGER", None), # Can't handle NULL
9196
("BOOLEAN", None), # Can't handle NULL
9297
("FLOAT", numpy.dtype(float)),
98+
# TIMESTAMP will be localized after DataFrame construction.
9399
("TIMESTAMP", "datetime64[ns]"),
94100
("DATETIME", "datetime64[ns]"),
95101
],
@@ -200,6 +206,10 @@ def test_to_gbq_with_verbose_old_pandas_no_warnings(recwarn, min_bq_version):
200206
assert len(recwarn) == 0
201207

202208

209+
@pytest.mark.skipif(
210+
pandas_installed_version < pkg_resources.parse_version("0.24.0"),
211+
reason="Requires pandas 0.24+",
212+
)
203213
def test_to_gbq_with_private_key_new_pandas_warns_deprecation(
204214
min_bq_version, monkeypatch
205215
):
@@ -413,6 +423,10 @@ def test_read_gbq_with_verbose_old_pandas_no_warnings(recwarn, min_bq_version):
413423
assert len(recwarn) == 0
414424

415425

426+
@pytest.mark.skipif(
427+
pandas_installed_version < pkg_resources.parse_version("0.24.0"),
428+
reason="Requires pandas 0.24+",
429+
)
416430
def test_read_gbq_with_private_key_new_pandas_warns_deprecation(
417431
min_bq_version, monkeypatch
418432
):

0 commit comments

Comments
 (0)