Skip to content

Commit 141b2b4

Browse files
authored
ENH: Use tz-aware dtype for timestamp columns (#269)
ENH: Use tz-aware dtype for timestamp columns in all supported pandas versions. Adds a table documenting this behavior to the "reading" how-to guides.
1 parent 0e1ebf5 commit 141b2b4

File tree

5 files changed

+132
-47
lines changed

5 files changed

+132
-47
lines changed

docs/source/changelog.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@ Changelog
1212
version. This is required to use new functionality such as the BigQuery
1313
Storage API. (:issue:`267`)
1414

15+
Documentation
16+
~~~~~~~~~~~~~
17+
18+
- Document :ref:`BigQuery data type to pandas dtype conversion
19+
<reading-dtypes>` for ``read_gbq``. (:issue:`269`)
20+
1521
Dependency updates
1622
~~~~~~~~~~~~~~~~~~
1723

@@ -27,11 +33,14 @@ Internal changes
2733

2834
Enhancements
2935
~~~~~~~~~~~~
36+
3037
- Allow ``table_schema`` in :func:`to_gbq` to contain only a subset of columns,
3138
with the rest being populated using the DataFrame dtypes (:issue:`218`)
3239
(contributed by @johnpaton)
3340
- Read ``project_id`` in :func:`to_gbq` from provided ``credentials`` if
3441
available (contributed by @daureg)
42+
- ``read_gbq`` uses the timezone-aware ``DatetimeTZDtype(unit='ns',
43+
tz='UTC')`` dtype for BigQuery ``TIMESTAMP`` columns. (:issue:`269`)
3544

3645
.. _changelog-0.9.0:
3746

docs/source/reading.rst

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,32 @@ Suppose you want to load all data from an existing BigQuery table
99

1010
.. code-block:: python
1111
12-
# Insert your BigQuery Project ID Here
13-
# Can be found in the Google web console
12+
import pandas_gbq
13+
14+
# TODO: Set your BigQuery Project ID.
1415
projectid = "xxxxxxxx"
1516
16-
data_frame = read_gbq('SELECT * FROM test_dataset.test_table', projectid)
17+
data_frame = pandas_gbq.read_gbq(
18+
'SELECT * FROM `test_dataset.test_table`',
19+
project_id=projectid)
20+
21+
.. note::
1722

23+
A project ID is sometimes optional if it can be inferred during
24+
authentication, but it is required when authenticating with user
25+
credentials. You can find your project ID in the `Google Cloud console
26+
<https://console.cloud.google.com>`__.
1827

1928
You can define which column from BigQuery to use as an index in the
2029
destination DataFrame as well as a preferred column order as follows:
2130

2231
.. code-block:: python
2332
24-
data_frame = read_gbq('SELECT * FROM test_dataset.test_table',
25-
index_col='index_column_name',
26-
col_order=['col1', 'col2', 'col3'], projectid)
33+
data_frame = pandas_gbq.read_gbq(
34+
'SELECT * FROM `test_dataset.test_table`',
35+
project_id=projectid,
36+
index_col='index_column_name',
37+
col_order=['col1', 'col2', 'col3'])
2738
2839
2940
You can specify the query config as parameter to use additional options of
@@ -37,20 +48,39 @@ your job. For more information about query configuration parameters see `here
3748
"useQueryCache": False
3849
}
3950
}
40-
data_frame = read_gbq('SELECT * FROM test_dataset.test_table',
41-
configuration=configuration, projectid)
51+
data_frame = read_gbq(
52+
'SELECT * FROM `test_dataset.test_table`',
53+
project_id=projectid,
54+
configuration=configuration)
4255
4356
44-
.. note::
57+
The ``dialect`` argument can be used to indicate whether to use
58+
BigQuery's ``'legacy'`` SQL or BigQuery's ``'standard'`` SQL (beta). The
59+
default value is ``'standard'``. For more information on BigQuery's standard
60+
SQL, see `BigQuery SQL Reference
61+
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__
4562

46-
You can find your project id in the `Google developers console
47-
<https://console.developers.google.com>`__.
63+
.. code-block:: python
4864
65+
data_frame = pandas_gbq.read_gbq(
66+
'SELECT * FROM [test_dataset.test_table]',
67+
project_id=projectid,
68+
dialect='legacy')
4969
50-
.. note::
5170
52-
The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL
53-
or BigQuery's ``'standard'`` SQL (beta). The default value is ``'legacy'``, though this will change
54-
in a subsequent release to ``'standard'``. For more information
55-
on BigQuery's standard SQL, see `BigQuery SQL Reference
56-
<https://cloud.google.com/bigquery/sql-reference/>`__
71+
.. _reading-dtypes:
72+
73+
Inferring the DataFrame's dtypes
74+
--------------------------------
75+
76+
The :func:`~pandas_gbq.read_gbq` method infers the pandas dtype for each column, based on the BigQuery table schema.
77+
78+
================== =====================================================================
BigQuery Data Type dtype
================== =====================================================================
FLOAT              float
TIMESTAMP          :class:`~pandas.DatetimeTZDtype` with ``unit='ns'`` and ``tz='UTC'``
DATETIME           datetime64[ns]
TIME               datetime64[ns]
DATE               datetime64[ns]
================== =====================================================================

pandas_gbq/gbq.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,9 @@ def run_query(self, query, **kwargs):
494494
if df.empty:
495495
df = _cast_empty_df_dtypes(schema_fields, df)
496496

497+
# Ensure any TIMESTAMP columns are tz-aware.
498+
df = _localize_df(schema_fields, df)
499+
497500
logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
498501
return df
499502

@@ -644,17 +647,21 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
644647

645648

646649
def _bqschema_to_nullsafe_dtypes(schema_fields):
647-
# Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
648-
# default dtype choice.
649-
#
650-
# See:
651-
# http://pandas.pydata.org/pandas-docs/dev/missing_data.html
652-
# #missing-data-casting-rules-and-indexing
650+
"""Specify explicit dtypes based on BigQuery schema.
651+
652+
This function only specifies a dtype when the dtype allows nulls.
653+
Otherwise, use pandas's default dtype choice.
654+
655+
See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html
656+
#missing-data-casting-rules-and-indexing
657+
"""
658+
# If you update this mapping, also update the table at
659+
# `docs/source/reading.rst`.
653660
dtype_map = {
654661
"FLOAT": np.dtype(float),
655-
# Even though TIMESTAMPs are timezone-aware in BigQuery, pandas doesn't
656-
# support datetime64[ns, UTC] as dtype in DataFrame constructors. See:
657-
# https://github.com/pandas-dev/pandas/issues/12513
662+
# pandas doesn't support timezone-aware dtype in DataFrame/Series
663+
# constructors. It's more idiomatic to localize after construction.
664+
# https://github.com/pandas-dev/pandas/issues/25843
658665
"TIMESTAMP": "datetime64[ns]",
659666
"TIME": "datetime64[ns]",
660667
"DATE": "datetime64[ns]",
@@ -702,6 +709,24 @@ def _cast_empty_df_dtypes(schema_fields, df):
702709
return df
703710

704711

712+
def _localize_df(schema_fields, df):
713+
"""Localize any TIMESTAMP columns to tz-aware type.
714+
715+
In pandas versions before 0.24.0, DatetimeTZDtype cannot be used as the
716+
dtype in Series/DataFrame construction, so localize those columns after
717+
the DataFrame is constructed.
718+
"""
719+
for field in schema_fields:
720+
column = str(field["name"])
721+
if field["mode"].upper() == "REPEATED":
722+
continue
723+
724+
if field["type"].upper() == "TIMESTAMP" and df[column].dt.tz is None:
725+
df[column] = df[column].dt.tz_localize("UTC")
726+
727+
return df
728+
729+
705730
def read_gbq(
706731
query,
707732
project_id=None,

tests/system/test_gbq.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -310,13 +310,15 @@ def test_should_properly_handle_timestamp_unix_epoch(self, project_id):
310310
credentials=self.credentials,
311311
dialect="legacy",
312312
)
313-
tm.assert_frame_equal(
314-
df,
315-
DataFrame(
316-
{"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
317-
dtype="datetime64[ns]",
318-
),
313+
expected = DataFrame(
314+
{"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
315+
dtype="datetime64[ns]",
319316
)
317+
if expected["unix_epoch"].dt.tz is None:
318+
expected["unix_epoch"] = expected["unix_epoch"].dt.tz_localize(
319+
"UTC"
320+
)
321+
tm.assert_frame_equal(df, expected)
320322

321323
def test_should_properly_handle_arbitrary_timestamp(self, project_id):
322324
query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp'
@@ -326,13 +328,15 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
326328
credentials=self.credentials,
327329
dialect="legacy",
328330
)
329-
tm.assert_frame_equal(
330-
df,
331-
DataFrame(
332-
{"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]},
333-
dtype="datetime64[ns]",
334-
),
331+
expected = DataFrame(
332+
{"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]},
333+
dtype="datetime64[ns]",
335334
)
335+
if expected["valid_timestamp"].dt.tz is None:
336+
expected["valid_timestamp"] = expected[
337+
"valid_timestamp"
338+
].dt.tz_localize("UTC")
339+
tm.assert_frame_equal(df, expected)
336340

337341
def test_should_properly_handle_datetime_unix_epoch(self, project_id):
338342
query = 'SELECT DATETIME("1970-01-01 00:00:00") AS unix_epoch'
@@ -368,7 +372,7 @@ def test_should_properly_handle_arbitrary_datetime(self, project_id):
368372
"expression, is_expected_dtype",
369373
[
370374
("current_date()", pandas.api.types.is_datetime64_ns_dtype),
371-
("current_timestamp()", pandas.api.types.is_datetime64_ns_dtype),
375+
("current_timestamp()", pandas.api.types.is_datetime64tz_dtype),
372376
("current_datetime()", pandas.api.types.is_datetime64_ns_dtype),
373377
("TRUE", pandas.api.types.is_bool_dtype),
374378
("FALSE", pandas.api.types.is_bool_dtype),
@@ -402,9 +406,11 @@ def test_should_properly_handle_null_timestamp(self, project_id):
402406
credentials=self.credentials,
403407
dialect="legacy",
404408
)
405-
tm.assert_frame_equal(
406-
df, DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]")
409+
expected = DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]")
410+
expected["null_timestamp"] = expected["null_timestamp"].dt.tz_localize(
411+
"UTC"
407412
)
413+
tm.assert_frame_equal(df, expected)
408414

409415
def test_should_properly_handle_null_datetime(self, project_id):
410416
query = "SELECT CAST(NULL AS DATETIME) AS null_datetime"
@@ -594,6 +600,7 @@ def test_zero_rows(self, project_id):
594600
expected_result = DataFrame(
595601
empty_columns, columns=["title", "id", "is_bot", "ts"]
596602
)
603+
expected_result["ts"] = expected_result["ts"].dt.tz_localize("UTC")
597604
tm.assert_frame_equal(df, expected_result, check_index_type=False)
598605

599606
def test_one_row_one_column(self, project_id):

tests/unit/test_gbq.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,26 @@
11
# -*- coding: utf-8 -*-
22

3-
import pandas.util.testing as tm
4-
import pytest
3+
try:
4+
import mock
5+
except ImportError: # pragma: NO COVER
6+
from unittest import mock
7+
58
import numpy
69
from pandas import DataFrame
10+
import pandas.util.testing as tm
11+
import pkg_resources
12+
import pytest
713

814
import pandas_gbq.exceptions
915
from pandas_gbq import gbq
1016

11-
try:
12-
import mock
13-
except ImportError: # pragma: NO COVER
14-
from unittest import mock
1517

1618
pytestmark = pytest.mark.filter_warnings(
1719
"ignore:credentials from Google Cloud SDK"
1820
)
21+
pandas_installed_version = pkg_resources.get_distribution(
22+
"pandas"
23+
).parsed_version
1924

2025

2126
@pytest.fixture
@@ -90,6 +95,7 @@ def no_auth(monkeypatch):
9095
("INTEGER", None), # Can't handle NULL
9196
("BOOLEAN", None), # Can't handle NULL
9297
("FLOAT", numpy.dtype(float)),
98+
# TIMESTAMP will be localized after DataFrame construction.
9399
("TIMESTAMP", "datetime64[ns]"),
94100
("DATETIME", "datetime64[ns]"),
95101
],
@@ -200,6 +206,10 @@ def test_to_gbq_with_verbose_old_pandas_no_warnings(recwarn, min_bq_version):
200206
assert len(recwarn) == 0
201207

202208

209+
@pytest.mark.skipif(
210+
pandas_installed_version < pkg_resources.parse_version("0.24.0"),
211+
reason="Requires pandas 0.24+",
212+
)
203213
def test_to_gbq_with_private_key_new_pandas_warns_deprecation(
204214
min_bq_version, monkeypatch
205215
):
@@ -413,6 +423,10 @@ def test_read_gbq_with_verbose_old_pandas_no_warnings(recwarn, min_bq_version):
413423
assert len(recwarn) == 0
414424

415425

426+
@pytest.mark.skipif(
427+
pandas_installed_version < pkg_resources.parse_version("0.24.0"),
428+
reason="Requires pandas 0.24+",
429+
)
416430
def test_read_gbq_with_private_key_new_pandas_warns_deprecation(
417431
min_bq_version, monkeypatch
418432
):

0 commit comments

Comments
 (0)