Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit b16252c

Browse files
authored
Merge pull request #787 from datafold/issue_786
Handle full timestamp ranges for redshift, postgres
2 parents 283bbac + 5e60fc1 commit b16252c

File tree

3 files changed: +38 -25 lines changed

3 files changed: +38 -25 lines changed

data_diff/databases/base.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -800,6 +800,12 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
800800
Date format: ``YYYY-MM-DD HH:mm:SS.FFFFFF``
801801
802802
Precision of dates should be rounded up/down according to coltype.rounds
803+
e.g. precision 3 and coltype.rounds:
804+
- 1969-12-31 23:59:59.999999 -> 1970-01-01 00:00:00.000000
805+
- 1970-01-01 00:00:00.000888 -> 1970-01-01 00:00:00.001000
806+
- 1970-01-01 00:00:00.123123 -> 1970-01-01 00:00:00.123000
807+
808+
Make sure NULLs remain NULLs
803809
"""
804810

805811
@abstractmethod

data_diff/databases/postgresql.py

Lines changed: 32 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -102,13 +102,40 @@ def md5_as_hex(self, s: str) -> str:
102102
return f"md5({s})"
103103

104104
def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
105+
def _add_padding(coltype: TemporalType, timestamp6: str):
106+
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
107+
105108
if coltype.rounds:
106-
return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"
109+
# NULL value expected to return NULL after normalization
110+
null_case_begin = f"CASE WHEN {value} IS NULL THEN NULL ELSE "
111+
null_case_end = "END"
112+
113+
# 294277 or 4714 BC would be out of range, make sure we can't round to that
114+
# TODO test timezones for overflow?
115+
max_timestamp = "294276-12-31 23:59:59.0000"
116+
min_timestamp = "4713-01-01 00:00:00.00 BC"
117+
timestamp = f"least('{max_timestamp}'::timestamp(6), {value}::timestamp(6))"
118+
timestamp = f"greatest('{min_timestamp}'::timestamp(6), {timestamp})"
119+
120+
interval = format((0.5 * (10 ** (-coltype.precision))), f".{coltype.precision+1}f")
121+
122+
rounded_timestamp = (
123+
f"left(to_char(least('{max_timestamp}'::timestamp, {timestamp})"
124+
f"+ interval '{interval}', 'YYYY-mm-dd HH24:MI:SS.US'),"
125+
f"length(to_char(least('{max_timestamp}'::timestamp, {timestamp})"
126+
f"+ interval '{interval}', 'YYYY-mm-dd HH24:MI:SS.US')) - (6-{coltype.precision}))"
127+
)
107128

108-
timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
109-
return (
110-
f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
111-
)
129+
padded = _add_padding(coltype, rounded_timestamp)
130+
return f"{null_case_begin} {padded} {null_case_end}"
131+
132+
# TODO years with > 4 digits not padded correctly
133+
# current w/ precision 6: 294276-12-31 23:59:59.0000
134+
# should be 294276-12-31 23:59:59.000000
135+
else:
136+
rounded_timestamp = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
137+
padded = _add_padding(coltype, rounded_timestamp)
138+
return padded
112139

113140
def normalize_number(self, value: str, coltype: FractionalType) -> str:
114141
return self.to_string(f"{value}::decimal(38, {coltype.precision})")

data_diff/databases/redshift.py

Lines changed: 0 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -51,26 +51,6 @@ def md5_as_int(self, s: str) -> str:
5151
def md5_as_hex(self, s: str) -> str:
5252
return f"md5({s})"
5353

54-
def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
55-
if coltype.rounds:
56-
timestamp = f"{value}::timestamp(6)"
57-
# Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
58-
secs = f"timestamp 'epoch' + round(extract(epoch from {timestamp})::decimal(38)"
59-
# Get the milliseconds from timestamp.
60-
ms = f"extract(ms from {timestamp})"
61-
# Get the microseconds from timestamp, without the milliseconds!
62-
us = f"extract(us from {timestamp})"
63-
# epoch = Total time since epoch in microseconds.
64-
epoch = f"{secs}*1000000 + {ms}*1000 + {us}"
65-
timestamp6 = (
66-
f"to_char({epoch}, -6+{coltype.precision}) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
67-
)
68-
else:
69-
timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
70-
return (
71-
f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
72-
)
73-
7454
def normalize_number(self, value: str, coltype: FractionalType) -> str:
7555
return self.to_string(f"{value}::decimal(38,{coltype.precision})")
7656

0 commit comments

Comments
 (0)