
Commit 439882c

cournape (David Cournapeau), kukushking (Anton Kukushkin), and jaidisido authored
fix: handle partitions with empty table in read_parquet with dataset=True (#2983)
* BUG: fix read_parquet with dataset=True when the first partition is empty. When reading a set of parquet files with dataset=True, if the first partition is empty, the current logic for dtype inference fails and raises an exception such as:

  ```
  pyarrow.lib.ArrowTypeError: Unable to merge: Field col0 has incompatible types: dictionary<values=null, indices=int32, ordered=0> vs dictionary<values=string, indices=int32, ordered=0>
  ```

  To fix this, we filter out empty table(s) before merging them into a single DataFrame.

* [style]: forgot to run ruff on the new code.

* bug: fix the corner case where every table is empty. While that corner case was caught by the full test suite, we add a mock test for it for quick turnaround.

---------

Co-authored-by: David Cournapeau <cournape@amazon.com>
Co-authored-by: Anton Kukushkin <kukushkin.anton@gmail.com>
Co-authored-by: jaidisido <jaidisido@gmail.com>
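The failure quoted above can be reproduced outside the library with plain pyarrow schema unification. The sketch below is illustrative and not code from this commit: the column name col0 matches the quoted error, and it assumes pa.unify_schemas rejects dictionary columns whose value types disagree (null from the empty partition vs. string from the non-empty one).

```python
import pyarrow as pa

# Hypothetical schemas mirroring the quoted error: the empty partition's
# partition column is inferred as dictionary<values=null>, while the
# non-empty partition's column is dictionary<values=string>.
empty_schema = pa.schema([("col0", pa.dictionary(pa.int32(), pa.null()))])
full_schema = pa.schema([("col0", pa.dictionary(pa.int32(), pa.string()))])

try:
    pa.unify_schemas([empty_schema, full_schema])
except pa.lib.ArrowTypeError as exc:
    # Expected to fail with an error like:
    # "Unable to merge: Field col0 has incompatible types: ..."
    print(exc)
```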
1 parent d396eea commit 439882c

File tree: 2 files changed (+49, -0)


awswrangler/s3/_read_parquet.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -311,6 +311,14 @@ def _read_parquet(
         itertools.repeat(schema),
         itertools.repeat(decryption_properties),
     )
+    # When the first table is empty in a dataset, the inferred schema may not
+    # be compatible with the other tables, which will raise an exception when
+    # concatenating them down the line. As a workaround, we filter out empty
+    # tables, unless every table is empty. In that latter case, the schemas
+    # will be compatible, so we do nothing.
+    should_filter_out = any(len(table) > 0 for table in tables)
+    if should_filter_out:
+        tables = [table for table in tables if len(table) > 0]
     return _utils.table_refs_to_df(tables, kwargs=arrow_kwargs)


```
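The guard added above can be sketched in isolation with plain pyarrow tables. The table contents below are hypothetical and do not come from this commit; only the filtering logic mirrors the new code.

```python
import pyarrow as pa

# Hypothetical per-partition tables: the first partition is empty.
tables = [
    pa.table({"id": pa.array([], type=pa.string())}),
    pa.table({"id": pa.array(["1", "1"], type=pa.string())}),
    pa.table({"id": pa.array(["1", "1", "1"], type=pa.string())}),
]

# Drop empty tables before concatenating, unless every table is empty.
if any(len(table) > 0 for table in tables):
    tables = [table for table in tables if len(table) > 0]

print(pa.concat_tables(tables).to_pandas())
```

When every table is empty the filter is skipped: all schemas were inferred the same way, so they merge cleanly and read_parquet still returns an empty DataFrame.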

tests/unit/test_moto.py

Lines changed: 41 additions & 0 deletions
```diff
@@ -485,6 +485,47 @@ def test_s3_delete_object_success(moto_s3_client: "S3Client") -> None:
     wr.s3.read_parquet(path=path, dataset=True)


+@pytest.mark.parametrize("chunked", [True, False])
+def test_s3_parquet_empty_table(moto_s3_client: "S3Client", chunked) -> None:
+    path = "s3://bucket/file.parquet"
+
+    r_df = pd.DataFrame({"id": []}, dtype=pd.Int64Dtype())
+    wr.s3.to_parquet(df=r_df, path=path)
+
+    df = wr.s3.read_parquet(path, chunked=chunked)
+    if chunked:
+        df = pd.concat(list(df))
+
+    pd.testing.assert_frame_equal(r_df, df, check_dtype=True)
+
+
+def test_s3_dataset_empty_table(moto_s3_client: "S3Client") -> None:
+    """Test that a dataset split into multiple parquet files whose first
+    partition is an empty table still loads properly.
+    """
+    partition_col, partition_val = "col0", "1"
+    dataset = f"{partition_col}={partition_val}"
+    s3_key = f"s3://bucket/{dataset}"
+
+    dtypes = {"id": "string[python]"}
+    df1 = pd.DataFrame({"id": []}).astype(dtypes)
+    df2 = pd.DataFrame({"id": ["1"] * 2}).astype(dtypes)
+    df3 = pd.DataFrame({"id": ["1"] * 3}).astype(dtypes)
+
+    dataframes = [df1, df2, df3]
+    r_df = pd.concat(dataframes, ignore_index=True)
+    r_df = r_df.assign(col0=pd.Categorical([partition_val] * len(r_df)))
+
+    for i, df in enumerate(dataframes):
+        wr.s3.to_parquet(
+            df=df,
+            path=f"{s3_key}/part{i}.parquet",
+        )
+
+    result_df = wr.s3.read_parquet(path=s3_key, dataset=True)
+    pd.testing.assert_frame_equal(result_df, r_df, check_dtype=True)
+
+
 def test_s3_raise_delete_object_exception_success(moto_s3_client: "S3Client") -> None:
     path = "s3://bucket/test.parquet"
     wr.s3.to_parquet(df=get_df_list(), path=path, index=False, dataset=True, partition_cols=["par0", "par1"])
```

0 commit comments
