diff --git a/pyiceberg/avro/writer.py b/pyiceberg/avro/writer.py index 80e96b04ad..954134f42c 100644 --- a/pyiceberg/avro/writer.py +++ b/pyiceberg/avro/writer.py @@ -32,6 +32,7 @@ List, Optional, Tuple, + Union, ) from uuid import UUID @@ -121,8 +122,22 @@ def write(self, encoder: BinaryEncoder, val: Any) -> None: @dataclass(frozen=True) class UUIDWriter(Writer): - def write(self, encoder: BinaryEncoder, val: UUID) -> None: - encoder.write(val.bytes) + def write(self, encoder: BinaryEncoder, val: Union[UUID, str, bytes]) -> None: + if isinstance(val, UUID): + encoder.write(val.bytes) + elif isinstance(val, bytes): + encoder.write(val) + elif isinstance(val, str): + if val.startswith("b'") and val.endswith("'"): + # Handle string representation of bytes + # Convert the escaped string to actual bytes + byte_string = val[2:-1].encode("utf-8").decode("unicode_escape").encode("latin1") + encoder.write(UUID(bytes=byte_string).bytes) + else: + # Regular UUID string + encoder.write(UUID(val).bytes) + else: + raise TypeError(f"Expected UUID, bytes, or string, got {type(val)}") @dataclass(frozen=True) diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py index 7bf7b462e2..7b3dde2771 100644 --- a/pyiceberg/conversions.py +++ b/pyiceberg/conversions.py @@ -267,10 +267,13 @@ def _(_: StringType, value: str) -> bytes: @to_bytes.register(UUIDType) -def _(_: UUIDType, value: Union[uuid.UUID, bytes]) -> bytes: +def _(_: UUIDType, value: Union[uuid.UUID, bytes, str]) -> bytes: if isinstance(value, bytes): - return value - return value.bytes + return str(uuid.UUID(bytes=value)).encode(UTF8) + elif isinstance(value, uuid.UUID): + return str(value).encode(UTF8) + else: + return str(uuid.UUID(value)).encode(UTF8) @to_bytes.register(BinaryType) @@ -355,11 +358,15 @@ def _(_: StringType, b: bytes) -> str: @from_bytes.register(BinaryType) @from_bytes.register(FixedType) -@from_bytes.register(UUIDType) def _(_: PrimitiveType, b: bytes) -> bytes: return b +@from_bytes.register(UUIDType) +def _(_: UUIDType, b: bytes) -> uuid.UUID: + return uuid.UUID(b.decode(UTF8)) + + @from_bytes.register(DecimalType) def _(primitive_type: DecimalType, buf: bytes) -> Decimal: unscaled = int.from_bytes(buf, "big", signed=True) diff --git a/pyiceberg/expressions/literals.py b/pyiceberg/expressions/literals.py index 81e613d55a..1b28171072 100644 --- a/pyiceberg/expressions/literals.py +++ b/pyiceberg/expressions/literals.py @@ -144,7 +144,7 @@ def literal(value: L) -> Literal[L]: elif isinstance(value, str): return StringLiteral(value) elif isinstance(value, UUID): - return UUIDLiteral(value.bytes) # type: ignore + return UUIDLiteral(UUID(bytes=value.bytes)) elif isinstance(value, bytes): return BinaryLiteral(value) elif isinstance(value, Decimal): @@ -586,8 +586,8 @@ def _(self, _: TimestamptzType) -> Literal[int]: return TimestampLiteral(timestamptz_to_micros(self.value)) @to.register(UUIDType) - def _(self, _: UUIDType) -> Literal[bytes]: - return UUIDLiteral(UUID(self.value).bytes) + def _(self, _: UUIDType) -> Literal[UUID]: + return UUIDLiteral(UUID(self.value)) @to.register(DecimalType) def _(self, type_var: DecimalType) -> Literal[Decimal]: @@ -631,22 +631,22 @@ def __repr__(self) -> str: return f"literal({repr(self.value)})" -class UUIDLiteral(Literal[bytes]): - def __init__(self, value: bytes) -> None: - super().__init__(value, bytes) +class UUIDLiteral(Literal[UUID]): + def __init__(self, value: UUID) -> None: + super().__init__(value, UUID) @singledispatchmethod def to(self, type_var: IcebergType) -> Literal: # type: ignore raise TypeError(f"Cannot convert UUIDLiteral into {type_var}") @to.register(UUIDType) - def _(self, _: UUIDType) -> Literal[bytes]: + def _(self, _: UUIDType) -> Literal[UUID]: return self @to.register(FixedType) def _(self, type_var: FixedType) -> Literal[bytes]: if len(type_var) == UUID_BYTES_LENGTH: - return FixedLiteral(self.value) + return FixedLiteral(self.value.bytes) else: raise TypeError( f"Cannot convert UUIDLiteral into {type_var}, different length: {len(type_var)} <> {UUID_BYTES_LENGTH}" @@ -654,7 +654,7 @@ def _(self, type_var: FixedType) -> Literal[bytes]: @to.register(BinaryType) def _(self, _: BinaryType) -> Literal[bytes]: - return BinaryLiteral(self.value) + return BinaryLiteral(self.value.bytes) class FixedLiteral(Literal[bytes]): @@ -679,9 +679,9 @@ def _(self, _: BinaryType) -> Literal[bytes]: return BinaryLiteral(self.value) @to.register(UUIDType) - def _(self, type_var: UUIDType) -> Literal[bytes]: + def _(self, type_var: UUIDType) -> Literal[UUID]: if len(self.value) == UUID_BYTES_LENGTH: - return UUIDLiteral(self.value) + return UUIDLiteral(UUID(bytes=self.value)) else: raise TypeError( f"Could not convert {self.value!r} into a {type_var}, lengths differ {len(self.value)} <> {UUID_BYTES_LENGTH}" @@ -710,9 +710,9 @@ def _(self, type_var: FixedType) -> Literal[bytes]: ) @to.register(UUIDType) - def _(self, type_var: UUIDType) -> Literal[bytes]: + def _(self, type_var: UUIDType) -> Literal[UUID]: if len(self.value) == UUID_BYTES_LENGTH: - return UUIDLiteral(self.value) + return UUIDLiteral(UUID(bytes=self.value)) else: raise TypeError( f"Cannot convert BinaryLiteral into {type_var}, different length: {UUID_BYTES_LENGTH} <> {len(self.value)}" diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index df07f94342..b28848b650 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -467,8 +467,17 @@ def _(type: IcebergType, value: Optional[time]) -> Optional[int]: @_to_partition_representation.register(UUIDType) -def _(type: IcebergType, value: Optional[uuid.UUID]) -> Optional[str]: - return str(value) if value is not None else None +def _(type: IcebergType, value: Optional[Union[uuid.UUID, int, bytes]]) -> Optional[Union[str, int]]: + if value is None: + return None + elif isinstance(value, bytes): + return str(uuid.UUID(bytes=value)) # IdentityTransform + elif isinstance(value, uuid.UUID): + return str(value) # IdentityTransform + elif isinstance(value, int): + return value # BucketTransform + else: + raise ValueError(f"Type not recognized: {value}") @_to_partition_representation.register(PrimitiveType) diff --git a/tests/expressions/test_literals.py b/tests/expressions/test_literals.py index 4d8f5557f6..d9a1ccb565 100644 --- a/tests/expressions/test_literals.py +++ b/tests/expressions/test_literals.py @@ -373,7 +373,7 @@ def test_string_to_uuid_literal() -> None: uuid_str = literal(str(expected)) uuid_lit = uuid_str.to(UUIDType()) - assert expected.bytes == uuid_lit.value + assert uuid_lit.value == expected def test_string_to_decimal_literal() -> None: @@ -530,8 +530,7 @@ def test_binary_to_uuid() -> None: lit = literal(test_uuid.bytes) uuid_lit = lit.to(UUIDType()) assert uuid_lit is not None - assert lit.value == uuid_lit.value - assert uuid_lit.value == test_uuid.bytes + assert uuid_lit.value == test_uuid def test_incompatible_binary_to_uuid() -> None: @@ -560,8 +559,7 @@ def test_fixed_to_uuid() -> None: lit = literal(test_uuid.bytes).to(FixedType(16)) uuid_lit = lit.to(UUIDType()) assert uuid_lit is not None - assert lit.value == uuid_lit.value - assert uuid_lit.value == test_uuid.bytes + assert uuid_lit.value == test_uuid def test_incompatible_fixed_to_uuid() -> None: @@ -900,7 +898,7 @@ def test_uuid_literal_initialization() -> None: test_uuid = uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7") uuid_literal = literal(test_uuid) assert isinstance(uuid_literal, Literal) - assert test_uuid.bytes == uuid_literal.value + assert test_uuid == uuid_literal.value def test_uuid_to_fixed() -> None: diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index 150d2b750c..96c3a9f1be 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -19,6 +19,7 @@ import os import random import time +import uuid from datetime import date, datetime, timedelta from decimal import Decimal from pathlib import Path @@ -48,7 +49,7 @@ from pyiceberg.schema import Schema from pyiceberg.table import TableProperties from pyiceberg.table.sorting import SortDirection, SortField, SortOrder -from pyiceberg.transforms import DayTransform, HourTransform, IdentityTransform +from pyiceberg.transforms import BucketTransform, DayTransform, HourTransform, IdentityTransform from pyiceberg.types import ( DateType, DecimalType, @@ -58,6 +59,7 @@ LongType, NestedField, StringType, + UUIDType, ) from utils import _create_table @@ -1841,3 +1843,56 @@ def test_read_write_decimals(session_catalog: Catalog) -> None: tbl.append(arrow_table) assert tbl.scan().to_arrow() == arrow_table + + +@pytest.mark.integration +def test_read_write_uuids_partitioned(session_catalog: Catalog) -> None: + """Test simple reading and writing partitioned UUID data types in supported transform. + - BucketTransform + - IdentityTransform + """ + + identifier = "default.test_read_write_uuids" + uuids = [ + uuid.UUID("ec9b663b-062f-4200-a130-8de19c21b800").bytes, + uuid.UUID("5f473c64-dbeb-449b-bdfa-b6b4185b1bde").bytes, + None, + ] + + arrow_table = pa.Table.from_pydict( + { + "uuid_1": pa.array(uuids, type=pa.binary(16)), + "uuid_2": pa.array(uuids, type=pa.binary(16)), + } + ) + + tbl = _create_table( + session_catalog, + identifier, + properties={"format-version": 2}, + schema=Schema( + NestedField(field_id=1, name="uuid_1", field_type=UUIDType(), required=False), + NestedField(field_id=2, name="uuid_2", field_type=UUIDType(), required=False), + ), + partition_spec=PartitionSpec( + PartitionField(source_id=1, field_id=1001, transform=BucketTransform(2), name="uuid_bucket"), + PartitionField(source_id=2, field_id=1002, transform=IdentityTransform(), name="uuid_indentity"), + ), + ) + + tbl.append(arrow_table) + assert tbl.scan().to_arrow() == arrow_table + # Check BucketTransform partitioning filtering + assert tbl.scan(row_filter=f"uuid_1 == '{uuid.UUID(bytes=uuids[0])}'").to_arrow() == pa.Table.from_pydict( + { + "uuid_1": pa.array([uuids[0]], type=pa.binary(16)), + "uuid_2": pa.array([uuids[0]], type=pa.binary(16)), + } + ) + # Check IdentityTransform partitioning filtering + assert tbl.scan(row_filter=f"uuid_2 == '{uuid.UUID(bytes=uuids[1])}'").to_arrow() == pa.Table.from_pydict( + { + "uuid_1": pa.array([uuids[1]], type=pa.binary(16)), + "uuid_2": pa.array([uuids[1]], type=pa.binary(16)), + } + ) diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py index 7a4d47317a..424ffab4fe 100644 --- a/tests/io/test_pyarrow_stats.py +++ b/tests/io/test_pyarrow_stats.py @@ -66,6 +66,7 @@ TableMetadataV1, TableMetadataV2, ) +from pyiceberg.typedef import UTF8 from pyiceberg.types import ( BooleanType, FloatType, @@ -537,7 +538,7 @@ def test_metrics_primitive_types() -> None: assert datafile.lower_bounds[8] == STRUCT_INT64.pack(datetime_to_micros(datetime(2022, 1, 2, 17, 30, 34, 399))) assert datafile.lower_bounds[9] == STRUCT_INT64.pack(datetime_to_micros(datetime(2022, 1, 2, 17, 30, 34, 399, tz))) assert datafile.lower_bounds[10] == b"he" - assert datafile.lower_bounds[11] == uuid.uuid3(uuid.NAMESPACE_DNS, "foo").bytes + assert datafile.lower_bounds[11] == str(uuid.uuid3(uuid.NAMESPACE_DNS, "foo")).encode(UTF8) assert datafile.lower_bounds[12] == b"he" assert datafile.lower_bounds[13][::-1].ljust(4, b"\x00") == STRUCT_INT32.pack(12345) assert datafile.lower_bounds[14][::-1].ljust(8, b"\x00") == STRUCT_INT64.pack(12345679123456) @@ -554,7 +555,7 @@ def test_metrics_primitive_types() -> None: assert datafile.upper_bounds[8] == STRUCT_INT64.pack(datetime_to_micros(datetime(2023, 2, 4, 13, 21, 4, 354))) assert datafile.upper_bounds[9] == STRUCT_INT64.pack(datetime_to_micros(datetime(2023, 2, 4, 13, 21, 4, 354, tz))) assert datafile.upper_bounds[10] == b"wp" - assert datafile.upper_bounds[11] == uuid.uuid3(uuid.NAMESPACE_DNS, "bar").bytes + assert datafile.upper_bounds[11] == str(uuid.uuid3(uuid.NAMESPACE_DNS, "bar")).encode(UTF8) assert datafile.upper_bounds[12] == b"wp" assert datafile.upper_bounds[13][::-1].ljust(4, b"\x00") == STRUCT_INT32.pack(67891) assert datafile.upper_bounds[14][::-1].ljust(8, b"\x00") == STRUCT_INT64.pack(67891234678912) diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index 57ab3e328a..d674651dcb 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -21,7 +21,7 @@ import pytest -from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec +from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.transforms import ( BucketTransform, @@ -29,6 +29,7 @@ HourTransform, IdentityTransform, MonthTransform, + Transform, TruncateTransform, YearTransform, ) @@ -217,6 +218,41 @@ def test_transform_consistency_with_pyarrow_transform(source_type: PrimitiveType raise +@pytest.mark.parametrize( + "source_type, _transform, input_value,expected_value", + [ + (UUIDType(), BucketTransform(2), UUID("ec9b663b-062f-4200-a130-8de19c21b800").bytes, 0), + ( + UUIDType(), + IdentityTransform(), + UUID("ec9b663b-062f-4200-a130-8de19c21b800"), + UUID("ec9b663b-062f-4200-a130-8de19c21b800"), + ), + (UUIDType(), TruncateTransform(1), UUID("ec9b663b-062f-4200-a130-8de19c21b800").bytes, None), + ], +) +def test_transform_uuid_partition_key( + source_type: PrimitiveType, _transform: Transform[Any, Any], input_value: UUID, expected_value: Any +) -> None: + """ + Tests that UUID values can be correctly transformed and used as partition keys with various transformation functions. + """ + schema = Schema(NestedField(field_id=1, name="uuid", field_type=source_type, required=True)) + partition_field = PartitionField(source_id=1, field_id=1001, transform=_transform, name="uuid_partition") + spec = PartitionSpec(partition_field) + + if _transform.can_transform(source_type): + transformer = _transform.transform(source=source_type) + + value = transformer(input_value) + assert value == expected_value + + partition_field_value = PartitionFieldValue(field=partition_field, value=value) + partition_key = PartitionKey(field_values=[partition_field_value], partition_spec=spec, schema=schema) + assert partition_key.field_values[0].field == partition_field + assert partition_key.field_values[0].value == expected_value + + def test_deserialize_partition_field_v2() -> None: json_partition_spec = """{"source-id": 1, "field-id": 1000, "transform": "truncate[19]", "name": "str_truncate"}""" diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 2ee0ba3dd9..97cce774e1 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -278,10 +278,10 @@ def test_partition_to_py_raise_on_incorrect_precision_or_scale( (StringType(), b"foo", "foo"), ( UUIDType(), - b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", - b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", + b"f79c3e09-677c-4bbd-a479-3f349cb785e7", + uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), ), - (UUIDType(), b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7"), + (UUIDType(), b"f79c3e09-677c-4bbd-a479-3f349cb785e7", uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")), (FixedType(3), b"foo", b"foo"), (BinaryType(), b"foo", b"foo"), (DecimalType(5, 2), b"\x30\x39", Decimal("123.45")), @@ -316,10 +316,10 @@ def test_from_bytes(primitive_type: PrimitiveType, b: bytes, result: Any) -> Non (StringType(), b"foo", "foo"), ( UUIDType(), - b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", - b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", + b"f79c3e09-677c-4bbd-a479-3f349cb785e7", + uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), ), - (UUIDType(), b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7"), + (UUIDType(), b"f79c3e09-677c-4bbd-a479-3f349cb785e7", uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")), (FixedType(3), b"foo", b"foo"), (BinaryType(), b"foo", b"foo"), (DecimalType(5, 2), b"\x30\x39", Decimal("123.45")), @@ -356,9 +356,9 @@ def test_round_trip_conversion(primitive_type: PrimitiveType, b: bytes, result: ( UUIDType(), uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), - b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", + b"f79c3e09-677c-4bbd-a479-3f349cb785e7", ), - (UUIDType(), uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7"), + (UUIDType(), uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), b"f79c3e09-677c-4bbd-a479-3f349cb785e7"), ], ) def test_uuid_to_bytes(primitive_type: PrimitiveType, v: Any, result: bytes) -> None: