
Commit 9a3842a

feat: Support different merge conditions in athena.to_iceberg function (#2861)
* added merge_condition to athena.to_iceberg
* fix query string
* fix ruff formatting
1 parent 7492962 commit 9a3842a

File tree

2 files changed: +70, -2 lines changed


awswrangler/athena/_write_iceberg.py

Lines changed: 16 additions & 2 deletions
@@ -213,6 +213,7 @@ def _validate_args(
     mode: Literal["append", "overwrite", "overwrite_partitions"],
     partition_cols: list[str] | None,
     merge_cols: list[str] | None,
+    merge_condition: Literal["update", "ignore"],
 ) -> None:
     if df.empty is True:
         raise exceptions.EmptyDataFrame("DataFrame cannot be empty.")
@@ -232,6 +233,11 @@ def _validate_args(
                 "When mode is 'overwrite_partitions' merge_cols must not be specified."
             )
 
+    if merge_cols and merge_condition not in ["update", "ignore"]:
+        raise exceptions.InvalidArgumentValue(
+            f"Invalid merge_condition: {merge_condition}. Valid values: ['update', 'ignore']"
+        )
+
 
 @apply_configs
 @_utils.validate_distributed_kwargs(
@@ -246,6 +252,7 @@ def to_iceberg(
     table_location: str | None = None,
     partition_cols: list[str] | None = None,
     merge_cols: list[str] | None = None,
+    merge_condition: Literal["update", "ignore"] = "update",
     keep_files: bool = True,
     data_source: str | None = None,
     s3_output: str | None = None,
@@ -292,6 +299,8 @@ def to_iceberg(
         List of column names that will be used for conditional inserts and updates.
 
         https://docs.aws.amazon.com/athena/latest/ug/merge-into-statement.html
+    merge_condition: str, optional
+        The condition to be used in the MERGE INTO statement. Valid values: ['update', 'ignore'].
     keep_files : bool
         Whether staging files produced by Athena are retained. 'True' by default.
     data_source : str, optional
@@ -376,6 +385,7 @@ def to_iceberg(
         mode=mode,
         partition_cols=partition_cols,
         merge_cols=merge_cols,
+        merge_condition=merge_condition,
     )
 
     glue_table_settings = cast(
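A minimal usage sketch of the new keyword (database, table, and S3 locations are hypothetical): the default merge_condition="update" keeps the previous upsert behaviour, while "ignore" leaves matched rows untouched and only inserts rows with no match on merge_cols.

    import pandas as pd
    import awswrangler as wr

    df = pd.DataFrame({"id": [1, 2], "value": ["a", "b"]})

    # Insert-only merge: rows whose "id" already exists in the Iceberg table
    # are left as-is; rows with new ids are inserted.
    wr.athena.to_iceberg(
        df=df,
        database="my_db",                        # hypothetical
        table="my_table",                        # hypothetical
        table_location="s3://my-bucket/table/",  # hypothetical
        temp_path="s3://my-bucket/tmp/",         # hypothetical
        merge_cols=["id"],
        merge_condition="ignore",
    )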
@@ -497,12 +507,16 @@ def to_iceberg(
     # Insert or merge into Iceberg table
     sql_statement: str
     if merge_cols:
+        if merge_condition == "update":
+            match_condition = f"""WHEN MATCHED THEN
+                UPDATE SET {', '.join([f'"{x}" = source."{x}"' for x in df.columns])}"""
+        else:
+            match_condition = ""
         sql_statement = f"""
             MERGE INTO "{database}"."{table}" target
             USING "{database}"."{temp_table}" source
             ON {' AND '.join([f'target."{x}" = source."{x}"' for x in merge_cols])}
-            WHEN MATCHED THEN
-                UPDATE SET {', '.join([f'"{x}" = source."{x}"' for x in df.columns])}
+            {match_condition}
             WHEN NOT MATCHED THEN
             INSERT ({', '.join([f'"{x}"' for x in df.columns])})
             VALUES ({', '.join([f'source."{x}"' for x in df.columns])})
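To make the branch concrete, this is roughly the statement the f-string renders for a hypothetical table with merge_cols=["id"], DataFrame columns id and value, and a staging table temp_table_xxx (names hypothetical, whitespace trimmed). With merge_condition="update" the MATCHED clause is kept; with "ignore" it renders as an empty string, so matched rows are left untouched:

    -- merge_condition="update"
    MERGE INTO "my_db"."my_table" target
    USING "my_db"."temp_table_xxx" source
    ON target."id" = source."id"
    WHEN MATCHED THEN
        UPDATE SET "id" = source."id", "value" = source."value"
    WHEN NOT MATCHED THEN
    INSERT ("id", "value")
    VALUES (source."id", source."value")

    -- merge_condition="ignore"
    MERGE INTO "my_db"."my_table" target
    USING "my_db"."temp_table_xxx" source
    ON target."id" = source."id"
    WHEN NOT MATCHED THEN
    INSERT ("id", "value")
    VALUES (source."id", source."value")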

tests/unit/test_athena_iceberg.py

Lines changed: 54 additions & 0 deletions
@@ -650,6 +650,60 @@ def test_athena_to_iceberg_merge_into(path: str, path2: str, glue_database: str,
     assert_pandas_equals(df_expected, df_out)
 
 
+def test_athena_to_iceberg_merge_into_ignore(path: str, path2: str, glue_database: str, glue_table: str) -> None:
+    df = pd.DataFrame({"title": ["Dune", "Fargo"], "year": ["1984", "1996"], "gross": [35_000_000, 60_000_000]})
+    df["title"] = df["title"].astype("string")
+    df["year"] = df["year"].astype("string")
+    df["gross"] = df["gross"].astype("Int64")
+
+    wr.athena.to_iceberg(
+        df=df,
+        database=glue_database,
+        table=glue_table,
+        table_location=path,
+        temp_path=path2,
+        keep_files=False,
+    )
+
+    # Perform MERGE INTO
+    df2 = pd.DataFrame({"title": ["Dune", "Fargo"], "year": ["2021", "1996"], "gross": [400_000_000, 60_000_001]})
+    df2["title"] = df2["title"].astype("string")
+    df2["year"] = df2["year"].astype("string")
+    df2["gross"] = df2["gross"].astype("Int64")
+
+    wr.athena.to_iceberg(
+        df=df2,
+        database=glue_database,
+        table=glue_table,
+        table_location=path,
+        temp_path=path2,
+        keep_files=False,
+        merge_cols=["title", "year"],
+        merge_condition="ignore",
+    )
+
+    # Expected output
+    df_expected = pd.DataFrame(
+        {
+            "title": ["Dune", "Fargo", "Dune"],
+            "year": ["1984", "1996", "2021"],
+            "gross": [35_000_000, 60_000_000, 400_000_000],
+        }
+    )
+    df_expected["title"] = df_expected["title"].astype("string")
+    df_expected["year"] = df_expected["year"].astype("string")
+    df_expected["gross"] = df_expected["gross"].astype("Int64")
+
+    df_out = wr.athena.read_sql_query(
+        sql=f'SELECT * FROM "{glue_table}" ORDER BY year',
+        database=glue_database,
+        ctas_approach=False,
+        unload_approach=False,
+    )
+
+    assert_pandas_equals(df_expected, df_out)
+
+
 def test_athena_to_iceberg_cols_order(path: str, path2: str, glue_database: str, glue_table: str) -> None:
     kwargs = {
         "database": glue_database,
