Commit 119dc4e

fix(athena): Enable use of dataframe type, in athena2pyarrow type (#2953)
* Enable use of dataframe type, in athena2pyarrow type
* Ruff format check
* Fix mypy error
* Add test to verify write was successful
* Fix test_save_dataframe_with_ms_units parameters
* Add use_threads parameter
1 parent 78522fd commit 119dc4e

File tree

awswrangler/_arrow.py
awswrangler/_data_types.py
tests/unit/test_s3_parquet.py

3 files changed: +49 -5 lines changed

awswrangler/_arrow.py

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ def _df_to_table(
     for col_name, col_type in dtype.items():
         if col_name in table.column_names:
             col_index = table.column_names.index(col_name)
-            pyarrow_dtype = athena2pyarrow(col_type)
+            pyarrow_dtype = athena2pyarrow(col_type, df.dtypes.get(col_name))
             field = pa.field(name=col_name, type=pyarrow_dtype)
             table = table.set_column(col_index, field, table.column(col_name).cast(pyarrow_dtype))
             _logger.debug("Casting column %s (%s) to %s (%s)", col_name, col_index, col_type, pyarrow_dtype)
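For context, a minimal sketch of what this change enables at the public API level. The bucket path and column name are hypothetical, and it assumes pandas 2.x, where a column can carry a non-nanosecond datetime unit:

import pandas as pd
import awswrangler as wr

# Hypothetical column: millisecond unit, holding a value outside the
# nanosecond-representable range (roughly years 1677-2262).
df = pd.DataFrame({"ts": pd.Series(["0800-01-01 00:00:00"], dtype="datetime64[ms]")})

# With this change, an explicit Athena "timestamp" cast resolves to the
# column's ms unit instead of always forcing a nanosecond cast.
wr.s3.to_parquet(df, "s3://my-bucket/my-prefix/", dataset=True, dtype={"ts": "timestamp"})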

awswrangler/_data_types.py

Lines changed: 12 additions & 4 deletions
@@ -12,7 +12,6 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-import pyarrow.parquet

 from awswrangler import _arrow, exceptions
 from awswrangler._distributed import engine
@@ -306,7 +305,7 @@ def _split_map(s: str) -> list[str]:
     return parts


-def athena2pyarrow(dtype: str) -> pa.DataType:  # noqa: PLR0911,PLR0912
+def athena2pyarrow(dtype: str, df_type: str | None = None) -> pa.DataType:  # noqa: PLR0911,PLR0912
     """Athena to PyArrow data types conversion."""
     dtype = dtype.strip()
     if dtype.startswith(("array", "struct", "map")):
@@ -329,7 +328,16 @@ def athena2pyarrow(dtype: str) -> pa.DataType:  # noqa: PLR0911,PLR0912
     if (dtype in ("string", "uuid")) or dtype.startswith("char") or dtype.startswith("varchar"):
         return pa.string()
     if dtype == "timestamp":
-        return pa.timestamp(unit="ns")
+        if df_type == "datetime64[ns]":
+            return pa.timestamp(unit="ns")
+        elif df_type == "datetime64[us]":
+            return pa.timestamp(unit="us")
+        elif df_type == "datetime64[ms]":
+            return pa.timestamp(unit="ms")
+        elif df_type == "datetime64[s]":
+            return pa.timestamp(unit="s")
+        else:
+            return pa.timestamp(unit="ns")
     if dtype == "date":
         return pa.date32()
     if dtype in ("binary" or "varbinary"):
@@ -701,7 +709,7 @@ def pyarrow_schema_from_pandas(
     )
     for k, v in casts.items():
         if (k not in ignore) and (k in df.columns or _is_index_name(k, df.index)):
-            columns_types[k] = athena2pyarrow(dtype=v)
+            columns_types[k] = athena2pyarrow(dtype=v, df_type=df.dtypes.get(k))
     columns_types = {k: v for k, v in columns_types.items() if v is not None}
     _logger.debug("columns_types: %s", columns_types)
     return pa.schema(fields=columns_types)
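A minimal sketch of the resulting conversion (athena2pyarrow is a private helper; calling it directly is for illustration only, assuming a version of awswrangler that includes this commit):

import pyarrow as pa
from awswrangler._data_types import athena2pyarrow

# The pandas dtype now selects the matching Arrow timestamp unit; with no
# df_type (or an unrecognized one), the previous ns default still applies.
assert athena2pyarrow("timestamp") == pa.timestamp("ns")
assert athena2pyarrow("timestamp", "datetime64[ms]") == pa.timestamp("ms")
assert athena2pyarrow("timestamp", "datetime64[s]") == pa.timestamp("s")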

tests/unit/test_s3_parquet.py

Lines changed: 36 additions & 0 deletions
@@ -1032,3 +1032,39 @@ def test_read_from_access_point(access_point_path_path: str) -> None:
     wr.s3.to_parquet(df, path)
     df_out = wr.s3.read_parquet(path)
     assert df_out.shape == (3, 3)
+
+
+@pytest.mark.parametrize("use_threads", [True, False, 2])
+def test_save_dataframe_with_ms_units(path, glue_database, glue_table, use_threads):
+    df = pd.DataFrame(
+        {
+            "c0": [
+                "2023-01-01 00:00:00.000",
+                "2023-01-02 00:00:00.000",
+                "0800-01-01 00:00:00.000",  # Out-of-bounds timestamp
+                "2977-09-21 00:12:43.000",
+            ]
+        }
+    )
+
+    wr.s3.to_parquet(
+        df,
+        path,
+        dataset=True,
+        database=glue_database,
+        table=glue_table,
+        use_threads=use_threads,
+    )
+
+    # Save exactly the same data twice. This ensures that even if the Athena table
+    # already exists, the flow of using its metadata to identify the schema of the
+    # data works correctly.
+    wr.s3.to_parquet(
+        df,
+        path,
+        dataset=True,
+        database=glue_database,
+        table=glue_table,
+        use_threads=use_threads,
+    )
+    df_out = wr.s3.read_parquet_table(table=glue_table, database=glue_database)
+    assert df_out.shape == (8, 1)
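Why the out-of-bounds rows matter: nanosecond timestamps only cover roughly the years 1677-2262, so values like 0800-01-01 or 2977-09-21 round-trip only when a coarser unit is preserved. A small standalone sketch of that constraint (assumes pandas 2.x; not part of the commit):

import pandas as pd
import pyarrow as pa

# Both values lie outside the ns-representable range.
s = pd.Series(["0800-01-01 00:00:00.000", "2977-09-21 00:12:43.000"], dtype="datetime64[ms]")

# Keeping the ms unit represents them exactly; a cast to pa.timestamp("ns")
# would overflow.
arr = pa.Array.from_pandas(s, type=pa.timestamp("ms"))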
