diff --git a/data_diff/databases/__init__.py b/data_diff/databases/__init__.py index 842cc731..99fb4446 100644 --- a/data_diff/databases/__init__.py +++ b/data_diff/databases/__init__.py @@ -1,4 +1,5 @@ from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, QueryError, ConnectError, BaseDialect, Database +from data_diff.databases.base import CHECKSUM_OFFSET from data_diff.databases._connect import connect as connect from data_diff.databases._connect import Connect as Connect from data_diff.databases.postgresql import PostgreSQL as PostgreSQL diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py index e4e215b7..f423e5a4 100644 --- a/data_diff/databases/base.py +++ b/data_diff/databases/base.py @@ -1156,6 +1156,17 @@ def is_autocommit(self) -> bool: _CHECKSUM_BITSIZE = CHECKSUM_HEXDIGITS << 2 CHECKSUM_MASK = (2**_CHECKSUM_BITSIZE) - 1 +# A bigint is typically 8 bytes. +# If the checksum is shorter, most databases will pad it with zeros, +# e.g. 0xFF → 0x00000000000000FF; +# because of that, the numeric representation is always positive, +# which limits the number of checksums that we can add together before overflowing. +# We can fix that by subtracting an offset of half the max value from every checksum, +# so that the values are distributed from -0.5*max to +0.5*max. +# Negative values can then compensate for the positive ones, allowing more checksums +# to be summed without overflowing. 
+CHECKSUM_OFFSET = CHECKSUM_MASK // 2 + DEFAULT_DATETIME_PRECISION = 6 DEFAULT_NUMERIC_PRECISION = 24 diff --git a/data_diff/databases/bigquery.py b/data_diff/databases/bigquery.py index 5592bfae..f2cd10be 100644 --- a/data_diff/databases/bigquery.py +++ b/data_diff/databases/bigquery.py @@ -36,6 +36,9 @@ ConnectError, apply_query, QueryResult, + CHECKSUM_OFFSET, + CHECKSUM_HEXDIGITS, + MD5_HEXDIGITS, ) from data_diff.databases.base import TIMESTAMP_PRECISION_POS, ThreadLocalInterpreter, Mixin_RandomSample @@ -62,7 +65,7 @@ def import_bigquery_service_account_impersonation(): @attrs.define(frozen=False) class Mixin_MD5(AbstractMixin_MD5): def md5_as_int(self, s: str) -> str: - return f"cast(cast( ('0x' || substr(TO_HEX(md5({s})), 18)) as int64) as numeric)" + return f"cast(cast( ('0x' || substr(TO_HEX(md5({s})), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS})) as int64) as numeric) - {CHECKSUM_OFFSET}" @attrs.define(frozen=False) diff --git a/data_diff/databases/clickhouse.py b/data_diff/databases/clickhouse.py index 193a4b44..8700991d 100644 --- a/data_diff/databases/clickhouse.py +++ b/data_diff/databases/clickhouse.py @@ -6,6 +6,7 @@ MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, TIMESTAMP_PRECISION_POS, + CHECKSUM_OFFSET, BaseDialect, ThreadedDatabase, import_helper, @@ -41,7 +42,9 @@ def import_clickhouse(): class Mixin_MD5(AbstractMixin_MD5): def md5_as_int(self, s: str) -> str: substr_idx = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS - return f"reinterpretAsUInt128(reverse(unhex(lowerUTF8(substr(hex(MD5({s})), {substr_idx})))))" + return ( + f"reinterpretAsUInt128(reverse(unhex(lowerUTF8(substr(hex(MD5({s})), {substr_idx}))))) - {CHECKSUM_OFFSET}" + ) @attrs.define(frozen=False) diff --git a/data_diff/databases/databricks.py b/data_diff/databases/databricks.py index 96772766..246cd965 100644 --- a/data_diff/databases/databricks.py +++ b/data_diff/databases/databricks.py @@ -21,6 +21,7 @@ from data_diff.databases.base import ( MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, + CHECKSUM_OFFSET, 
BaseDialect, ThreadedDatabase, import_helper, @@ -39,7 +40,7 @@ def import_databricks(): @attrs.define(frozen=False) class Mixin_MD5(AbstractMixin_MD5): def md5_as_int(self, s: str) -> str: - return f"cast(conv(substr(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16, 10) as decimal(38, 0))" + return f"cast(conv(substr(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16, 10) as decimal(38, 0)) - {CHECKSUM_OFFSET}" @attrs.define(frozen=False) diff --git a/data_diff/databases/duckdb.py b/data_diff/databases/duckdb.py index d70395a3..8922e62f 100644 --- a/data_diff/databases/duckdb.py +++ b/data_diff/databases/duckdb.py @@ -29,6 +29,7 @@ ConnectError, ThreadLocalInterpreter, TIMESTAMP_PRECISION_POS, + CHECKSUM_OFFSET, ) from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, Mixin_Schema from data_diff.queries.ast_classes import Func, Compilable, ITable @@ -45,7 +46,7 @@ def import_duckdb(): @attrs.define(frozen=False) class Mixin_MD5(AbstractMixin_MD5): def md5_as_int(self, s: str) -> str: - return f"('0x' || SUBSTRING(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS},{CHECKSUM_HEXDIGITS}))::BIGINT" + return f"('0x' || SUBSTRING(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS},{CHECKSUM_HEXDIGITS}))::BIGINT - {CHECKSUM_OFFSET}" @attrs.define(frozen=False) diff --git a/data_diff/databases/mssql.py b/data_diff/databases/mssql.py index e2845ec8..3123521d 100644 --- a/data_diff/databases/mssql.py +++ b/data_diff/databases/mssql.py @@ -7,6 +7,7 @@ CHECKSUM_HEXDIGITS, Mixin_OptimizerHints, Mixin_RandomSample, + CHECKSUM_OFFSET, QueryError, ThreadedDatabase, import_helper, @@ -60,7 +61,7 @@ def normalize_number(self, value: str, coltype: FractionalType) -> str: @attrs.define(frozen=False) class Mixin_MD5(AbstractMixin_MD5): def md5_as_int(self, s: str) -> str: - return f"convert(bigint, convert(varbinary, '0x' + RIGHT(CONVERT(NVARCHAR(32), HashBytes('MD5', {s}), 2), {CHECKSUM_HEXDIGITS}), 1))" + return f"convert(bigint, convert(varbinary, '0x' + 
RIGHT(CONVERT(NVARCHAR(32), HashBytes('MD5', {s}), 2), {CHECKSUM_HEXDIGITS}), 1)) - {CHECKSUM_OFFSET}" @attrs.define(frozen=False) diff --git a/data_diff/databases/mysql.py b/data_diff/databases/mysql.py index 29b26e2a..bee5db33 100644 --- a/data_diff/databases/mysql.py +++ b/data_diff/databases/mysql.py @@ -31,6 +31,7 @@ MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, TIMESTAMP_PRECISION_POS, + CHECKSUM_OFFSET, Mixin_Schema, Mixin_RandomSample, ) @@ -47,7 +48,7 @@ def import_mysql(): @attrs.define(frozen=False) class Mixin_MD5(AbstractMixin_MD5): def md5_as_int(self, s: str) -> str: - return f"cast(conv(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16, 10) as unsigned)" + return f"cast(conv(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16, 10) as decimal(38, 0)) - {CHECKSUM_OFFSET}" @attrs.define(frozen=False) diff --git a/data_diff/databases/oracle.py b/data_diff/databases/oracle.py index c16b9271..260769b2 100644 --- a/data_diff/databases/oracle.py +++ b/data_diff/databases/oracle.py @@ -27,6 +27,9 @@ ConnectError, QueryError, Mixin_RandomSample, + CHECKSUM_OFFSET, + CHECKSUM_HEXDIGITS, + MD5_HEXDIGITS, ) from data_diff.databases.base import TIMESTAMP_PRECISION_POS @@ -45,7 +48,7 @@ class Mixin_MD5(AbstractMixin_MD5): def md5_as_int(self, s: str) -> str: # standard_hash is faster than DBMS_CRYPTO.Hash # TODO: Find a way to use UTL_RAW.CAST_TO_BINARY_INTEGER ? 
- return f"to_number(substr(standard_hash({s}, 'MD5'), 18), 'xxxxxxxxxxxxxxx')" + return f"to_number(substr(standard_hash({s}, 'MD5'), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), '{'x' * CHECKSUM_HEXDIGITS}') - {CHECKSUM_OFFSET}" @attrs.define(frozen=False) diff --git a/data_diff/databases/postgresql.py b/data_diff/databases/postgresql.py index fa9de4a8..cd83b0e7 100644 --- a/data_diff/databases/postgresql.py +++ b/data_diff/databases/postgresql.py @@ -25,6 +25,7 @@ CHECKSUM_HEXDIGITS, _CHECKSUM_BITSIZE, TIMESTAMP_PRECISION_POS, + CHECKSUM_OFFSET, Mixin_RandomSample, ) @@ -42,7 +43,7 @@ def import_postgresql(): @attrs.define(frozen=False) class Mixin_MD5(AbstractMixin_MD5): def md5_as_int(self, s: str) -> str: - return f"('x' || substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}))::bit({_CHECKSUM_BITSIZE})::bigint" + return f"('x' || substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}))::bit({_CHECKSUM_BITSIZE})::bigint - {CHECKSUM_OFFSET}" @attrs.define(frozen=False) diff --git a/data_diff/databases/presto.py b/data_diff/databases/presto.py index 2aef9991..a819b29a 100644 --- a/data_diff/databases/presto.py +++ b/data_diff/databases/presto.py @@ -33,6 +33,7 @@ from data_diff.databases.base import ( MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, + CHECKSUM_OFFSET, TIMESTAMP_PRECISION_POS, ) @@ -56,7 +57,7 @@ def import_presto(): @attrs.define(frozen=False) class Mixin_MD5(AbstractMixin_MD5): def md5_as_int(self, s: str) -> str: - return f"cast(from_base(substr(to_hex(md5(to_utf8({s}))), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16) as decimal(38, 0))" + return f"cast(from_base(substr(to_hex(md5(to_utf8({s}))), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16) as decimal(38, 0)) - {CHECKSUM_OFFSET}" @attrs.define(frozen=False) diff --git a/data_diff/databases/redshift.py b/data_diff/databases/redshift.py index b11ed3c8..61755e91 100644 --- a/data_diff/databases/redshift.py +++ b/data_diff/databases/redshift.py @@ -16,6 +16,7 @@ PostgreSQL, MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, + CHECKSUM_OFFSET, 
TIMESTAMP_PRECISION_POS, PostgresqlDialect, Mixin_NormalizeValue, @@ -26,7 +27,7 @@ @attrs.define(frozen=False) class Mixin_MD5(Mixin_MD5): def md5_as_int(self, s: str) -> str: - return f"strtol(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16)::decimal(38)" + return f"strtol(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16)::decimal(38) - {CHECKSUM_OFFSET}" @attrs.define(frozen=False) diff --git a/data_diff/databases/snowflake.py b/data_diff/databases/snowflake.py index d83c0f40..4cc9853c 100644 --- a/data_diff/databases/snowflake.py +++ b/data_diff/databases/snowflake.py @@ -30,6 +30,7 @@ import_helper, CHECKSUM_MASK, ThreadLocalInterpreter, + CHECKSUM_OFFSET, Mixin_RandomSample, ) @@ -46,7 +47,7 @@ def import_snowflake(): @attrs.define(frozen=False) class Mixin_MD5(AbstractMixin_MD5): def md5_as_int(self, s: str) -> str: - return f"BITAND(md5_number_lower64({s}), {CHECKSUM_MASK})" + return f"BITAND(md5_number_lower64({s}), {CHECKSUM_MASK}) - {CHECKSUM_OFFSET}" @attrs.define(frozen=False) diff --git a/data_diff/databases/vertica.py b/data_diff/databases/vertica.py index dda4e1dd..e16c2b3b 100644 --- a/data_diff/databases/vertica.py +++ b/data_diff/databases/vertica.py @@ -5,6 +5,7 @@ from data_diff.utils import match_regexps from data_diff.databases.base import ( CHECKSUM_HEXDIGITS, + CHECKSUM_OFFSET, MD5_HEXDIGITS, TIMESTAMP_PRECISION_POS, BaseDialect, @@ -42,7 +43,7 @@ def import_vertica(): @attrs.define(frozen=False) class Mixin_MD5(AbstractMixin_MD5): def md5_as_int(self, s: str) -> str: - return f"CAST(HEX_TO_INTEGER(SUBSTRING(MD5({s}), {1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS})) AS NUMERIC(38, 0))" + return f"CAST(HEX_TO_INTEGER(SUBSTRING(MD5({s}), {1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS})) AS NUMERIC(38, 0)) - {CHECKSUM_OFFSET}" @attrs.define(frozen=False) diff --git a/tests/common.py b/tests/common.py index cca8e798..018f2e32 100644 --- a/tests/common.py +++ b/tests/common.py @@ -124,13 +124,13 @@ def str_to_checksum(str: str): # 
hello world # => 5eb63bbbe01eeed093cb22bb8f5acdc3 # => cb22bb8f5acdc3 - # => 273350391345368515 + # => 273350391345368515 - offset (see db.CHECKSUM_OFFSET) m = hashlib.md5() m.update(str.encode("utf-8")) # encode to binary md5 = m.hexdigest() # 0-indexed, unlike DBs which are 1-indexed here, so +1 in dbs half_pos = db.MD5_HEXDIGITS - db.CHECKSUM_HEXDIGITS - return int(md5[half_pos:], 16) + return int(md5[half_pos:], 16) - db.CHECKSUM_OFFSET class DiffTestCase(unittest.TestCase):