diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
index 871c650d..c5931979 100644
--- a/data_diff/databases/base.py
+++ b/data_diff/databases/base.py
@@ -800,6 +800,12 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
         Date format: ``YYYY-MM-DD HH:mm:SS.FFFFFF``
 
         Precision of dates should be rounded up/down according to coltype.rounds
+        e.g. precision 3 and coltype.rounds:
+            - 1969-12-31 23:59:59.999999 -> 1970-01-01 00:00:00.000000
+            - 1970-01-01 00:00:00.000888 -> 1970-01-01 00:00:00.001000
+            - 1970-01-01 00:00:00.123123 -> 1970-01-01 00:00:00.123000
+
+        Make sure NULLs remain NULLs
         """
 
     @abstractmethod
diff --git a/data_diff/databases/postgresql.py b/data_diff/databases/postgresql.py
index 4b9e945f..d29fa0eb 100644
--- a/data_diff/databases/postgresql.py
+++ b/data_diff/databases/postgresql.py
@@ -102,13 +102,40 @@ def md5_as_hex(self, s: str) -> str:
         return f"md5({s})"
 
     def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
+        def _add_padding(coltype: TemporalType, timestamp6: str):
+            return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
+
         if coltype.rounds:
-            return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"
+            # NULL value expected to return NULL after normalization
+            null_case_begin = f"CASE WHEN {value} IS NULL THEN NULL ELSE "
+            null_case_end = "END"
+
+            # 294277 or 4714 BC would be out of range, make sure we can't round to that
+            # TODO test timezones for overflow? 
+            max_timestamp = "294276-12-31 23:59:59.0000"
+            min_timestamp = "4713-01-01 00:00:00.00 BC"
+            timestamp = f"least('{max_timestamp}'::timestamp(6), {value}::timestamp(6))"
+            timestamp = f"greatest('{min_timestamp}'::timestamp(6), {timestamp})"
+
+            interval = format((0.5 * (10 ** (-coltype.precision))), f".{coltype.precision+1}f")
+
+            rounded_timestamp = (
+                f"left(to_char(least('{max_timestamp}'::timestamp, {timestamp})"
+                f"+ interval '{interval}', 'YYYY-mm-dd HH24:MI:SS.US'),"
+                f"length(to_char(least('{max_timestamp}'::timestamp, {timestamp})"
+                f"+ interval '{interval}', 'YYYY-mm-dd HH24:MI:SS.US')) - (6-{coltype.precision}))"
+            )
 
-        timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
-        return (
-            f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
-        )
+            padded = _add_padding(coltype, rounded_timestamp)
+            return f"{null_case_begin} {padded} {null_case_end}"
+
+            # TODO years with > 4 digits not padded correctly
+            # current w/ precision 6: 294276-12-31 23:59:59.0000
+            # should be 294276-12-31 23:59:59.000000
+        else:
+            rounded_timestamp = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
+            padded = _add_padding(coltype, rounded_timestamp)
+            return padded
 
     def normalize_number(self, value: str, coltype: FractionalType) -> str:
         return self.to_string(f"{value}::decimal(38, {coltype.precision})")
diff --git a/data_diff/databases/redshift.py b/data_diff/databases/redshift.py
index dcf061c4..d31258e1 100644
--- a/data_diff/databases/redshift.py
+++ b/data_diff/databases/redshift.py
@@ -51,26 +51,6 @@ def md5_as_int(self, s: str) -> str:
     def md5_as_hex(self, s: str) -> str:
         return f"md5({s})"
 
-    def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
-        if coltype.rounds:
-            timestamp = f"{value}::timestamp(6)"
-            # Get seconds since epoch. Redshift doesn't support milli- or micro-seconds. 
-            secs = f"timestamp 'epoch' + round(extract(epoch from {timestamp})::decimal(38)"
-            # Get the milliseconds from timestamp.
-            ms = f"extract(ms from {timestamp})"
-            # Get the microseconds from timestamp, without the milliseconds!
-            us = f"extract(us from {timestamp})"
-            # epoch = Total time since epoch in microseconds.
-            epoch = f"{secs}*1000000 + {ms}*1000 + {us}"
-            timestamp6 = (
-                f"to_char({epoch}, -6+{coltype.precision}) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
-            )
-        else:
-            timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
-        return (
-            f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
-        )
-
     def normalize_number(self, value: str, coltype: FractionalType) -> str:
         return self.to_string(f"{value}::decimal(38,{coltype.precision})")
 