Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Handle full timestamp ranges for redshift, postgres #787

Merged
merged 1 commit into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions data_diff/databases/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,12 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
Date format: ``YYYY-MM-DD HH:mm:SS.FFFFFF``

When coltype.rounds is set, timestamps should be rounded (up or down) to coltype.precision fractional-second digits,
e.g. with precision 3 and coltype.rounds:
- 1969-12-31 23:59:59.999999 -> 1970-01-01 00:00:00.000000
- 1970-01-01 00:00:00.000888 -> 1970-01-01 00:00:00.001000
- 1970-01-01 00:00:00.123123 -> 1970-01-01 00:00:00.123000

Make sure NULLs remain NULLs
"""

@abstractmethod
Expand Down
37 changes: 32 additions & 5 deletions data_diff/databases/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,40 @@ def md5_as_hex(self, s: str) -> str:
return f"md5({s})"

def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
    """Build a SQL expression rendering ``value`` as a fixed-width timestamp string.

    The text is produced with ``to_char(..., 'YYYY-mm-dd HH24:MI:SS.US')``;
    fractional digits beyond ``coltype.precision`` are cut off and the
    result is right-padded with ``'0'``.

    When ``coltype.rounds`` is set, the value is first clamped between
    ``min_timestamp`` and ``max_timestamp`` and shifted by half a unit of
    the target precision, so that the following truncation rounds to the
    nearest representable value. A NULL input is mapped to NULL explicitly.
    When ``coltype.rounds`` is not set, the value is only truncated.

    :param value: SQL expression yielding the timestamp to normalize.
    :param coltype: column type providing ``precision`` and ``rounds``.
    :return: SQL expression evaluating to the normalized text.
    """

    def _add_padding(coltype: TemporalType, timestamp6: str):
        # Keep everything up to the requested fractional digit, then pad
        # with zeros out to six fractional digits.
        # TODO years with > 4 digits not padded correctly
        # current w/ precision 6: 294276-12-31 23:59:59.0000
        # should be 294276-12-31 23:59:59.000000
        return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"

    if not coltype.rounds:
        # No rounding requested: format at full precision and truncate.
        truncated = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
        return _add_padding(coltype, truncated)

    # NULL value expected to return NULL after normalization
    null_case_begin = f"CASE WHEN {value} IS NULL THEN NULL ELSE "
    null_case_end = "END"

    # 294277 or 4714 BC would be out of range, make sure we can't round to that
    # TODO test timezones for overflow?
    max_timestamp = "294276-12-31 23:59:59.0000"
    min_timestamp = "4713-01-01 00:00:00.00 BC"
    timestamp = f"least('{max_timestamp}'::timestamp(6), {value}::timestamp(6))"
    timestamp = f"greatest('{min_timestamp}'::timestamp(6), {timestamp})"

    # Half a unit of the last kept digit, e.g. precision 3 -> '0.0005'.
    interval = format((0.5 * (10 ** (-coltype.precision))), f".{coltype.precision+1}f")

    # The shifted, formatted timestamp is needed twice (once as the string to
    # cut, once to measure its length), so build the fragment a single time.
    shifted = (
        f"to_char(least('{max_timestamp}'::timestamp, {timestamp})"
        f"+ interval '{interval}', 'YYYY-mm-dd HH24:MI:SS.US')"
    )
    # Drop the (6 - precision) trailing digits: shift + cut == round.
    rounded_timestamp = f"left({shifted},length({shifted}) - (6-{coltype.precision}))"

    padded = _add_padding(coltype, rounded_timestamp)
    return f"{null_case_begin} {padded} {null_case_end}"

def normalize_number(self, value: str, coltype: FractionalType) -> str:
    """Return a SQL expression casting ``value`` to a fixed-scale decimal as text.

    The expression casts ``value`` to ``decimal(38, coltype.precision)`` and
    passes the result through ``self.to_string``.

    :param value: SQL expression yielding the number to normalize.
    :param coltype: column type providing ``precision`` (the decimal scale).
    :return: whatever ``self.to_string`` returns for the cast expression.
    """
    as_decimal = f"{value}::decimal(38, {coltype.precision})"
    return self.to_string(as_decimal)
Expand Down
20 changes: 0 additions & 20 deletions data_diff/databases/redshift.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,26 +51,6 @@ def md5_as_int(self, s: str) -> str:
def md5_as_hex(self, s: str) -> str:
    """Return a SQL expression that applies ``md5()`` to the expression ``s``.

    :param s: SQL expression to be hashed.
    :return: the text ``md5(<s>)``.
    """
    return f"md5({s})"

def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
    """Build a SQL expression rendering ``value`` as a fixed-width timestamp string.

    The text is produced with ``to_char(..., 'YYYY-mm-dd HH24:MI:SS.US')``,
    cut after ``coltype.precision`` fractional digits and right-padded
    with ``'0'``.

    When ``coltype.rounds`` is set, the timestamp is first converted to a
    microsecond count since the epoch, rounded with ``round(..., -6+precision)``
    and converted back to a timestamp before formatting.

    :param value: SQL expression yielding the timestamp to normalize.
    :param coltype: column type providing ``precision`` and ``rounds``.
    :return: SQL expression evaluating to the normalized text.
    """
    if coltype.rounds:
        timestamp = f"{value}::timestamp(6)"
        # Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
        secs = f"extract(epoch from {timestamp})::decimal(38)"
        # Get the milliseconds from timestamp.
        ms = f"extract(ms from {timestamp})"
        # Get the microseconds from timestamp, without the milliseconds!
        us = f"extract(us from {timestamp})"
        # Total time since epoch in microseconds.
        micros = f"{secs}*1000000 + {ms}*1000 + {us}"
        # A negative scale rounds away the digits beyond the requested precision.
        rounded = f"round({micros}, -6+{coltype.precision})"
        # Turn the rounded microsecond count back into a timestamp and format it.
        timestamp6 = (
            f"to_char(timestamp 'epoch' + {rounded} * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
        )
    else:
        timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
    # Keep up to the requested fractional digit, then zero-pad to six digits.
    return (
        f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
    )

def normalize_number(self, value: str, coltype: FractionalType) -> str:
    """Return a SQL expression casting ``value`` to a fixed-scale decimal as text.

    The expression casts ``value`` to ``decimal(38,coltype.precision)`` and
    passes the result through ``self.to_string``.

    :param value: SQL expression yielding the number to normalize.
    :param coltype: column type providing ``precision`` (the decimal scale).
    :return: whatever ``self.to_string`` returns for the cast expression.
    """
    as_decimal = f"{value}::decimal(38,{coltype.precision})"
    return self.to_string(as_decimal)

Expand Down