
Commit 5cb7a4d

feat(redshift): Automatically add new DataFrame columns to Redshift tables during write operation (#2948)
* feat: Automatically add new columns to Redshift table during COPY operation
* feat: Automatically add new columns to Redshift table during COPY operation
* feat: Automatically add new columns to Redshift table during COPY operation
* fix: ruff formatting
* fix: ruff formatting
* fix: get redshift_types only if needed
* feat: Automatically add new columns to Redshift table during COPY operation
* feat: Automatically add new columns to Redshift table during COPY operation
* chore: code style
* chore: code style
* chore: code style
* feat: Automatically add new columns to Redshift table during COPY operation
* feat: Automatically add new columns to Redshift table during write operation
* feat: Automatically add new columns to Redshift table during write operation
* feat: Automatically add new columns to Redshift table during write operation
* feat: Automatically add new columns to Redshift table during write operation
1 parent 119dc4e commit 5cb7a4d
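
For context, a minimal usage sketch of the new flag (not part of the commit; the Glue connection name, bucket, IAM role, and table names below are illustrative placeholders):

import awswrangler as wr
import pandas as pd

con = wr.redshift.connect("aws-sdk-pandas-redshift")  # illustrative Glue connection name

# The DataFrame carries a column ("score") that does not yet exist in public.my_table.
df = pd.DataFrame({"id": [1, 2], "name": ["foo", "bar"], "score": [0.1, 0.2]})

# With add_new_columns=True, the missing column is added to the target table
# (ALTER TABLE ... ADD COLUMN) before the staged Parquet files are loaded via COPY.
wr.redshift.copy(
    df=df,
    path="s3://my-bucket/staging/",  # illustrative staging prefix
    con=con,
    table="my_table",
    schema="public",
    mode="append",
    iam_role="arn:aws:iam::111111111111:role/redshift-copy",  # illustrative role ARN
    add_new_columns=True,
)

con.close()

The same keyword is exposed on wr.redshift.to_sql and wr.redshift.copy_from_files (see the diffs below) and defaults to False, so existing behaviour is unchanged.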

File tree: 3 files changed (+353, -44 lines)

awswrangler/redshift/_utils.py

Lines changed: 132 additions & 43 deletions
@@ -106,6 +106,27 @@ def _get_primary_keys(cursor: "redshift_connector.Cursor", schema: str, table: s
     return fields
 
 
+def _get_table_columns(cursor: "redshift_connector.Cursor", schema: str, table: str) -> list[str]:
+    sql = f"SELECT column_name FROM svv_columns\n WHERE table_schema = '{schema}' AND table_name = '{table}'"
+    _logger.debug("Executing select query:\n%s", sql)
+    cursor.execute(sql)
+    result: tuple[list[str]] = cursor.fetchall()
+    columns = ["".join(lst) for lst in result]
+    return columns
+
+
+def _add_table_columns(
+    cursor: "redshift_connector.Cursor", schema: str, table: str, new_columns: dict[str, str]
+) -> None:
+    for column_name, column_type in new_columns.items():
+        sql = (
+            f"ALTER TABLE {_identifier(schema)}.{_identifier(table)}"
+            f"\nADD COLUMN {_identifier(column_name)} {column_type};"
+        )
+        _logger.debug("Executing alter query:\n%s", sql)
+        cursor.execute(sql)
+
+
 def _does_table_exist(cursor: "redshift_connector.Cursor", schema: str | None, table: str) -> bool:
     schema_str = f"TABLE_SCHEMA = '{schema}' AND" if schema else ""
     sql = (
@@ -128,6 +149,16 @@ def _get_paths_from_manifest(path: str, boto3_session: boto3.Session | None = No
     return paths
 
 
+def _get_parameter_setting(cursor: "redshift_connector.Cursor", parameter_name: str) -> str:
+    sql = f"SHOW {parameter_name}"
+    _logger.debug("Executing select query:\n%s", sql)
+    cursor.execute(sql)
+    result = cursor.fetchall()
+    status = str(result[0][0])
+    _logger.debug(f"{parameter_name}='{status}'")
+    return status
+
+
 def _lock(
     cursor: "redshift_connector.Cursor",
     table_names: list[str],
@@ -267,7 +298,90 @@ def _redshift_types_from_path(
     return redshift_types
 
 
-def _create_table(  # noqa: PLR0912,PLR0913,PLR0915
+def _get_rsh_columns_types(
+    df: pd.DataFrame | None,
+    path: str | list[str] | None,
+    index: bool,
+    dtype: dict[str, str] | None,
+    varchar_lengths_default: int,
+    varchar_lengths: dict[str, int] | None,
+    data_format: Literal["parquet", "orc", "csv"] = "parquet",
+    redshift_column_types: dict[str, str] | None = None,
+    parquet_infer_sampling: float = 1.0,
+    path_suffix: str | None = None,
+    path_ignore_suffix: str | list[str] | None = None,
+    manifest: bool | None = False,
+    use_threads: bool | int = True,
+    boto3_session: boto3.Session | None = None,
+    s3_additional_kwargs: dict[str, str] | None = None,
+) -> dict[str, str]:
+    if df is not None:
+        redshift_types: dict[str, str] = _data_types.database_types_from_pandas(
+            df=df,
+            index=index,
+            dtype=dtype,
+            varchar_lengths_default=varchar_lengths_default,
+            varchar_lengths=varchar_lengths,
+            converter_func=_data_types.pyarrow2redshift,
+        )
+        _logger.debug("Converted redshift types from pandas: %s", redshift_types)
+    elif path is not None:
+        if manifest:
+            if not isinstance(path, str):
+                raise TypeError(
+                    f"""type: {type(path)} is not a valid type for 'path' when 'manifest' is set to True;
+                    must be a string"""
+                )
+            path = _get_paths_from_manifest(
+                path=path,
+                boto3_session=boto3_session,
+            )
+
+        if data_format in ["parquet", "orc"]:
+            redshift_types = _redshift_types_from_path(
+                path=path,
+                data_format=data_format,  # type: ignore[arg-type]
+                varchar_lengths_default=varchar_lengths_default,
+                varchar_lengths=varchar_lengths,
+                parquet_infer_sampling=parquet_infer_sampling,
+                path_suffix=path_suffix,
+                path_ignore_suffix=path_ignore_suffix,
+                use_threads=use_threads,
+                boto3_session=boto3_session,
+                s3_additional_kwargs=s3_additional_kwargs,
+            )
+        else:
+            if redshift_column_types is None:
+                raise ValueError(
+                    "redshift_column_types is None. It must be specified for files formats other than Parquet or ORC."
+                )
+            redshift_types = redshift_column_types
+    else:
+        raise ValueError("df and path are None. You MUST pass at least one.")
+    return redshift_types
+
+
+def _add_new_table_columns(
+    cursor: "redshift_connector.Cursor", schema: str, table: str, redshift_columns_types: dict[str, str]
+) -> None:
+    # Check if Redshift is configured as case sensitive or not
+    is_case_sensitive = False
+    if _get_parameter_setting(cursor=cursor, parameter_name="enable_case_sensitive_identifier").lower() in [
+        "on",
+        "true",
+    ]:
+        is_case_sensitive = True
+
+    # If it is case-insensitive, convert all the DataFrame columns names to lowercase before performing the comparison
+    if is_case_sensitive is False:
+        redshift_columns_types = {key.lower(): value for key, value in redshift_columns_types.items()}
+    actual_table_columns = set(_get_table_columns(cursor=cursor, schema=schema, table=table))
+    new_df_columns = {key: value for key, value in redshift_columns_types.items() if key not in actual_table_columns}
+
+    _add_table_columns(cursor=cursor, schema=schema, table=table, new_columns=new_df_columns)
+
+
+def _create_table(  # noqa: PLR0913
     df: pd.DataFrame | None,
     path: str | list[str] | None,
     con: "redshift_connector.Connection",
@@ -336,49 +450,24 @@ def _create_table(  # noqa: PLR0912,PLR0913,PLR0915
         return table, schema
     diststyle = diststyle.upper() if diststyle else "AUTO"
     sortstyle = sortstyle.upper() if sortstyle else "COMPOUND"
-    if df is not None:
-        redshift_types: dict[str, str] = _data_types.database_types_from_pandas(
-            df=df,
-            index=index,
-            dtype=dtype,
-            varchar_lengths_default=varchar_lengths_default,
-            varchar_lengths=varchar_lengths,
-            converter_func=_data_types.pyarrow2redshift,
-        )
-        _logger.debug("Converted redshift types from pandas: %s", redshift_types)
-    elif path is not None:
-        if manifest:
-            if not isinstance(path, str):
-                raise TypeError(
-                    f"""type: {type(path)} is not a valid type for 'path' when 'manifest' is set to True;
-                    must be a string"""
-                )
-            path = _get_paths_from_manifest(
-                path=path,
-                boto3_session=boto3_session,
-            )
 
-        if data_format in ["parquet", "orc"]:
-            redshift_types = _redshift_types_from_path(
-                path=path,
-                data_format=data_format,  # type: ignore[arg-type]
-                varchar_lengths_default=varchar_lengths_default,
-                varchar_lengths=varchar_lengths,
-                parquet_infer_sampling=parquet_infer_sampling,
-                path_suffix=path_suffix,
-                path_ignore_suffix=path_ignore_suffix,
-                use_threads=use_threads,
-                boto3_session=boto3_session,
-                s3_additional_kwargs=s3_additional_kwargs,
-            )
-        else:
-            if redshift_column_types is None:
-                raise ValueError(
-                    "redshift_column_types is None. It must be specified for files formats other than Parquet or ORC."
-                )
-            redshift_types = redshift_column_types
-    else:
-        raise ValueError("df and path are None. You MUST pass at least one.")
+    redshift_types = _get_rsh_columns_types(
+        df=df,
+        path=path,
+        index=index,
+        dtype=dtype,
+        varchar_lengths_default=varchar_lengths_default,
+        varchar_lengths=varchar_lengths,
+        parquet_infer_sampling=parquet_infer_sampling,
+        path_suffix=path_suffix,
+        path_ignore_suffix=path_ignore_suffix,
+        use_threads=use_threads,
+        boto3_session=boto3_session,
+        s3_additional_kwargs=s3_additional_kwargs,
+        data_format=data_format,
+        redshift_column_types=redshift_column_types,
+        manifest=manifest,
+    )
    _validate_parameters(
        redshift_types=redshift_types,
        diststyle=diststyle,
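
To make the behaviour of the new helpers concrete, here is a standalone sketch of the column-diffing step (illustrative names only; the real helpers read the existing columns from svv_columns and the case-sensitivity setting via SHOW enable_case_sensitive_identifier on the live cursor):

def new_columns_to_add(
    df_types: dict[str, str], table_columns: list[str], case_sensitive: bool
) -> dict[str, str]:
    """Return DataFrame columns (name -> Redshift type) missing from the target table."""
    if not case_sensitive:
        # Redshift folds identifiers to lowercase unless enable_case_sensitive_identifier is on.
        df_types = {name.lower(): col_type for name, col_type in df_types.items()}
    existing = set(table_columns)
    return {name: col_type for name, col_type in df_types.items() if name not in existing}


# The table already has "id" and "name"; the DataFrame adds "Score".
print(new_columns_to_add(
    {"id": "BIGINT", "name": "VARCHAR(256)", "Score": "DOUBLE PRECISION"},
    ["id", "name"],
    case_sensitive=False,
))
# {'score': 'DOUBLE PRECISION'} -> one ALTER TABLE ... ADD COLUMN statement per entry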

awswrangler/redshift/_write.py

Lines changed: 52 additions & 1 deletion
@@ -13,7 +13,14 @@
 from awswrangler._config import apply_configs
 
 from ._connect import _validate_connection
-from ._utils import _create_table, _make_s3_auth_string, _upsert
+from ._utils import (
+    _add_new_table_columns,
+    _create_table,
+    _does_table_exist,
+    _get_rsh_columns_types,
+    _make_s3_auth_string,
+    _upsert,
+)
 
 if TYPE_CHECKING:
     try:
@@ -102,6 +109,7 @@ def to_sql(
     chunksize: int = 200,
     commit_transaction: bool = True,
     precombine_key: str | None = None,
+    add_new_columns: bool = False,
 ) -> None:
     """Write records stored in a DataFrame into Redshift.
 
@@ -169,6 +177,8 @@ def to_sql(
         When there is a primary_key match during upsert, this column will change the upsert method,
         comparing the values of the specified column from source and target, and keeping the
         larger of the two. Will only work when mode = upsert.
+    add_new_columns
+        If True, it automatically adds the new DataFrame columns into the target table.
 
     Examples
     --------
@@ -191,6 +201,19 @@ def to_sql(
     con.autocommit = False
     try:
         with con.cursor() as cursor:
+            if add_new_columns and _does_table_exist(cursor=cursor, schema=schema, table=table):
+                redshift_columns_types = _get_rsh_columns_types(
+                    df=df,
+                    path=None,
+                    index=index,
+                    dtype=dtype,
+                    varchar_lengths_default=varchar_lengths_default,
+                    varchar_lengths=varchar_lengths,
+                )
+                _add_new_table_columns(
+                    cursor=cursor, schema=schema, table=table, redshift_columns_types=redshift_columns_types
+                )
+
             created_table, created_schema = _create_table(
                 df=df,
                 path=None,
@@ -280,6 +303,7 @@ def copy_from_files(  # noqa: PLR0913
     s3_additional_kwargs: dict[str, str] | None = None,
     precombine_key: str | None = None,
     column_names: list[str] | None = None,
+    add_new_columns: bool = False,
 ) -> None:
     """Load files from S3 to a Table on Amazon Redshift (Through COPY command).
 
@@ -396,6 +420,8 @@ def copy_from_files(  # noqa: PLR0913
         larger of the two. Will only work when mode = upsert.
     column_names
         List of column names to map source data fields to the target columns.
+    add_new_columns
+        If True, it automatically adds the new DataFrame columns into the target table.
 
     Examples
     --------
@@ -420,6 +446,27 @@ def copy_from_files(  # noqa: PLR0913
     con.autocommit = False
     try:
         with con.cursor() as cursor:
+            if add_new_columns and _does_table_exist(cursor=cursor, schema=schema, table=table):
+                redshift_columns_types = _get_rsh_columns_types(
+                    df=None,
+                    path=path,
+                    index=False,
+                    dtype=None,
+                    varchar_lengths_default=varchar_lengths_default,
+                    varchar_lengths=varchar_lengths,
+                    parquet_infer_sampling=parquet_infer_sampling,
+                    path_suffix=path_suffix,
+                    path_ignore_suffix=path_ignore_suffix,
+                    use_threads=use_threads,
+                    boto3_session=boto3_session,
+                    s3_additional_kwargs=s3_additional_kwargs,
+                    data_format=data_format,
+                    redshift_column_types=redshift_column_types,
+                    manifest=manifest,
+                )
+                _add_new_table_columns(
+                    cursor=cursor, schema=schema, table=table, redshift_columns_types=redshift_columns_types
+                )
             created_table, created_schema = _create_table(
                 df=None,
                 path=path,
@@ -521,6 +568,7 @@ def copy(  # noqa: PLR0913
     max_rows_by_file: int | None = 10_000_000,
     precombine_key: str | None = None,
     use_column_names: bool = False,
+    add_new_columns: bool = False,
 ) -> None:
     """Load Pandas DataFrame as a Table on Amazon Redshift using parquet files on S3 as stage.
 
@@ -628,6 +676,8 @@ def copy(  # noqa: PLR0913
         If set to True, will use the column names of the DataFrame for generating the INSERT SQL Query.
         E.g. If the DataFrame has two columns `col1` and `col3` and `use_column_names` is True, data will only be
         inserted into the database columns `col1` and `col3`.
+    add_new_columns
+        If True, it automatically adds the new DataFrame columns into the target table.
 
     Examples
     --------
@@ -692,6 +742,7 @@ def copy(  # noqa: PLR0913
             sql_copy_extra_params=sql_copy_extra_params,
             precombine_key=precombine_key,
             column_names=column_names,
+            add_new_columns=add_new_columns,
         )
     finally:
         if keep_files is False:
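
The same flag on the file-based path, as a hedged sketch (connection name, bucket, role ARN, and table are placeholders; types for the new columns are inferred from the staged Parquet files, as in _get_rsh_columns_types above):

import awswrangler as wr

con = wr.redshift.connect("aws-sdk-pandas-redshift")  # illustrative Glue connection name

# Parquet files already staged in S3; any columns they contain that are missing from
# public.my_table are added before the COPY statement runs.
wr.redshift.copy_from_files(
    path="s3://my-bucket/staging/",
    con=con,
    table="my_table",
    schema="public",
    data_format="parquet",
    mode="append",
    iam_role="arn:aws:iam::111111111111:role/redshift-copy",  # illustrative role ARN
    add_new_columns=True,
)

con.close()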
