Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit f080ce7

Browse files
authored
Merge pull request #746 from datafold/fix-checksum-padding
add checksum offset to avoid bigint overflow
2 parents 9acbf6a + c0a52b7 commit f080ce7

File tree

15 files changed

+44
-14
lines changed

15 files changed

+44
-14
lines changed

data_diff/databases/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, QueryError, ConnectError, BaseDialect, Database
2+
from data_diff.databases.base import CHECKSUM_OFFSET
23
from data_diff.databases._connect import connect as connect
34
from data_diff.databases._connect import Connect as Connect
45
from data_diff.databases.postgresql import PostgreSQL as PostgreSQL

data_diff/databases/base.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1156,6 +1156,17 @@ def is_autocommit(self) -> bool:
11561156
_CHECKSUM_BITSIZE = CHECKSUM_HEXDIGITS << 2
11571157
CHECKSUM_MASK = (2**_CHECKSUM_BITSIZE) - 1
11581158

1159+
# bigint is typically 8 bytes
1160+
# if checksum is shorter, most databases will pad it with zeros
1161+
# 0xFF → 0x00000000000000FF;
1162+
# because of that, the numeric representation is always positive,
1163+
# which limits the number of checksums that we can add together before overflowing.
1164+
# we can fix that by adding a negative offset of half the max value,
1165+
# so that the distribution is from -0.5*max to +0.5*max.
1166+
# then negative numbers can compensate for the positive ones, allowing us to add more checksums together
1167+
# without overflowing.
1168+
CHECKSUM_OFFSET = CHECKSUM_MASK // 2
1169+
11591170
DEFAULT_DATETIME_PRECISION = 6
11601171
DEFAULT_NUMERIC_PRECISION = 24
11611172

data_diff/databases/bigquery.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@
3636
ConnectError,
3737
apply_query,
3838
QueryResult,
39+
CHECKSUM_OFFSET,
40+
CHECKSUM_HEXDIGITS,
41+
MD5_HEXDIGITS,
3942
)
4043
from data_diff.databases.base import TIMESTAMP_PRECISION_POS, ThreadLocalInterpreter, Mixin_RandomSample
4144

@@ -62,7 +65,7 @@ def import_bigquery_service_account_impersonation():
6265
@attrs.define(frozen=False)
6366
class Mixin_MD5(AbstractMixin_MD5):
6467
def md5_as_int(self, s: str) -> str:
65-
return f"cast(cast( ('0x' || substr(TO_HEX(md5({s})), 18)) as int64) as numeric)"
68+
return f"cast(cast( ('0x' || substr(TO_HEX(md5({s})), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS})) as int64) as numeric) - {CHECKSUM_OFFSET}"
6669

6770

6871
@attrs.define(frozen=False)

data_diff/databases/clickhouse.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
MD5_HEXDIGITS,
77
CHECKSUM_HEXDIGITS,
88
TIMESTAMP_PRECISION_POS,
9+
CHECKSUM_OFFSET,
910
BaseDialect,
1011
ThreadedDatabase,
1112
import_helper,
@@ -41,7 +42,9 @@ def import_clickhouse():
4142
class Mixin_MD5(AbstractMixin_MD5):
4243
def md5_as_int(self, s: str) -> str:
4344
substr_idx = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
44-
return f"reinterpretAsUInt128(reverse(unhex(lowerUTF8(substr(hex(MD5({s})), {substr_idx})))))"
45+
return (
46+
f"reinterpretAsUInt128(reverse(unhex(lowerUTF8(substr(hex(MD5({s})), {substr_idx}))))) - {CHECKSUM_OFFSET}"
47+
)
4548

4649

4750
@attrs.define(frozen=False)

data_diff/databases/databricks.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from data_diff.databases.base import (
2222
MD5_HEXDIGITS,
2323
CHECKSUM_HEXDIGITS,
24+
CHECKSUM_OFFSET,
2425
BaseDialect,
2526
ThreadedDatabase,
2627
import_helper,
@@ -39,7 +40,7 @@ def import_databricks():
3940
@attrs.define(frozen=False)
4041
class Mixin_MD5(AbstractMixin_MD5):
4142
def md5_as_int(self, s: str) -> str:
42-
return f"cast(conv(substr(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16, 10) as decimal(38, 0))"
43+
return f"cast(conv(substr(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16, 10) as decimal(38, 0)) - {CHECKSUM_OFFSET}"
4344

4445

4546
@attrs.define(frozen=False)

data_diff/databases/duckdb.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
ConnectError,
3030
ThreadLocalInterpreter,
3131
TIMESTAMP_PRECISION_POS,
32+
CHECKSUM_OFFSET,
3233
)
3334
from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, Mixin_Schema
3435
from data_diff.queries.ast_classes import Func, Compilable, ITable
@@ -45,7 +46,7 @@ def import_duckdb():
4546
@attrs.define(frozen=False)
4647
class Mixin_MD5(AbstractMixin_MD5):
4748
def md5_as_int(self, s: str) -> str:
48-
return f"('0x' || SUBSTRING(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS},{CHECKSUM_HEXDIGITS}))::BIGINT"
49+
return f"('0x' || SUBSTRING(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS},{CHECKSUM_HEXDIGITS}))::BIGINT - {CHECKSUM_OFFSET}"
4950

5051

5152
@attrs.define(frozen=False)

data_diff/databases/mssql.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
CHECKSUM_HEXDIGITS,
88
Mixin_OptimizerHints,
99
Mixin_RandomSample,
10+
CHECKSUM_OFFSET,
1011
QueryError,
1112
ThreadedDatabase,
1213
import_helper,
@@ -60,7 +61,7 @@ def normalize_number(self, value: str, coltype: FractionalType) -> str:
6061
@attrs.define(frozen=False)
6162
class Mixin_MD5(AbstractMixin_MD5):
6263
def md5_as_int(self, s: str) -> str:
63-
return f"convert(bigint, convert(varbinary, '0x' + RIGHT(CONVERT(NVARCHAR(32), HashBytes('MD5', {s}), 2), {CHECKSUM_HEXDIGITS}), 1))"
64+
return f"convert(bigint, convert(varbinary, '0x' + RIGHT(CONVERT(NVARCHAR(32), HashBytes('MD5', {s}), 2), {CHECKSUM_HEXDIGITS}), 1)) - {CHECKSUM_OFFSET}"
6465

6566

6667
@attrs.define(frozen=False)

data_diff/databases/mysql.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
MD5_HEXDIGITS,
3232
CHECKSUM_HEXDIGITS,
3333
TIMESTAMP_PRECISION_POS,
34+
CHECKSUM_OFFSET,
3435
Mixin_Schema,
3536
Mixin_RandomSample,
3637
)
@@ -47,7 +48,7 @@ def import_mysql():
4748
@attrs.define(frozen=False)
4849
class Mixin_MD5(AbstractMixin_MD5):
4950
def md5_as_int(self, s: str) -> str:
50-
return f"cast(conv(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16, 10) as unsigned)"
51+
return f"conv(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16, 10) - {CHECKSUM_OFFSET}"
5152

5253

5354
@attrs.define(frozen=False)

data_diff/databases/oracle.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
ConnectError,
2828
QueryError,
2929
Mixin_RandomSample,
30+
CHECKSUM_OFFSET,
31+
CHECKSUM_HEXDIGITS,
32+
MD5_HEXDIGITS,
3033
)
3134
from data_diff.databases.base import TIMESTAMP_PRECISION_POS
3235

@@ -45,7 +48,7 @@ class Mixin_MD5(AbstractMixin_MD5):
4548
def md5_as_int(self, s: str) -> str:
4649
# standard_hash is faster than DBMS_CRYPTO.Hash
4750
# TODO: Find a way to use UTL_RAW.CAST_TO_BINARY_INTEGER ?
48-
return f"to_number(substr(standard_hash({s}, 'MD5'), 18), 'xxxxxxxxxxxxxxx')"
51+
return f"to_number(substr(standard_hash({s}, 'MD5'), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 'xxxxxxxxxxxxxxx') - {CHECKSUM_OFFSET}"
4952

5053

5154
@attrs.define(frozen=False)

data_diff/databases/postgresql.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
CHECKSUM_HEXDIGITS,
2626
_CHECKSUM_BITSIZE,
2727
TIMESTAMP_PRECISION_POS,
28+
CHECKSUM_OFFSET,
2829
Mixin_RandomSample,
2930
)
3031

@@ -42,7 +43,7 @@ def import_postgresql():
4243
@attrs.define(frozen=False)
4344
class Mixin_MD5(AbstractMixin_MD5):
4445
def md5_as_int(self, s: str) -> str:
45-
return f"('x' || substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}))::bit({_CHECKSUM_BITSIZE})::bigint"
46+
return f"('x' || substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}))::bit({_CHECKSUM_BITSIZE})::bigint - {CHECKSUM_OFFSET}"
4647

4748

4849
@attrs.define(frozen=False)

data_diff/databases/presto.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from data_diff.databases.base import (
3434
MD5_HEXDIGITS,
3535
CHECKSUM_HEXDIGITS,
36+
CHECKSUM_OFFSET,
3637
TIMESTAMP_PRECISION_POS,
3738
)
3839

@@ -56,7 +57,7 @@ def import_presto():
5657
@attrs.define(frozen=False)
5758
class Mixin_MD5(AbstractMixin_MD5):
5859
def md5_as_int(self, s: str) -> str:
59-
return f"cast(from_base(substr(to_hex(md5(to_utf8({s}))), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16) as decimal(38, 0))"
60+
return f"cast(from_base(substr(to_hex(md5(to_utf8({s}))), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16) as decimal(38, 0)) - {CHECKSUM_OFFSET}"
6061

6162

6263
@attrs.define(frozen=False)

data_diff/databases/redshift.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
PostgreSQL,
1717
MD5_HEXDIGITS,
1818
CHECKSUM_HEXDIGITS,
19+
CHECKSUM_OFFSET,
1920
TIMESTAMP_PRECISION_POS,
2021
PostgresqlDialect,
2122
Mixin_NormalizeValue,
@@ -26,7 +27,7 @@
2627
@attrs.define(frozen=False)
2728
class Mixin_MD5(Mixin_MD5):
2829
def md5_as_int(self, s: str) -> str:
29-
return f"strtol(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16)::decimal(38)"
30+
return f"strtol(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16)::decimal(38) - {CHECKSUM_OFFSET}"
3031

3132

3233
@attrs.define(frozen=False)

data_diff/databases/snowflake.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import_helper,
3131
CHECKSUM_MASK,
3232
ThreadLocalInterpreter,
33+
CHECKSUM_OFFSET,
3334
Mixin_RandomSample,
3435
)
3536

@@ -46,7 +47,7 @@ def import_snowflake():
4647
@attrs.define(frozen=False)
4748
class Mixin_MD5(AbstractMixin_MD5):
4849
def md5_as_int(self, s: str) -> str:
49-
return f"BITAND(md5_number_lower64({s}), {CHECKSUM_MASK})"
50+
return f"BITAND(md5_number_lower64({s}), {CHECKSUM_MASK}) - {CHECKSUM_OFFSET}"
5051

5152

5253
@attrs.define(frozen=False)

data_diff/databases/vertica.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from data_diff.utils import match_regexps
66
from data_diff.databases.base import (
77
CHECKSUM_HEXDIGITS,
8+
CHECKSUM_OFFSET,
89
MD5_HEXDIGITS,
910
TIMESTAMP_PRECISION_POS,
1011
BaseDialect,
@@ -42,7 +43,7 @@ def import_vertica():
4243
@attrs.define(frozen=False)
4344
class Mixin_MD5(AbstractMixin_MD5):
4445
def md5_as_int(self, s: str) -> str:
45-
return f"CAST(HEX_TO_INTEGER(SUBSTRING(MD5({s}), {1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS})) AS NUMERIC(38, 0))"
46+
return f"CAST(HEX_TO_INTEGER(SUBSTRING(MD5({s}), {1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS})) AS NUMERIC(38, 0)) - {CHECKSUM_OFFSET}"
4647

4748

4849
@attrs.define(frozen=False)

tests/common.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,13 +124,13 @@ def str_to_checksum(str: str):
124124
# hello world
125125
# => 5eb63bbbe01eeed093cb22bb8f5acdc3
126126
# => 3cb22bb8f5acdc3
127-
# => 273350391345368515
127+
# => 273350391345368515 - offset (see db.CHECKSUM_OFFSET)
128128
m = hashlib.md5()
129129
m.update(str.encode("utf-8")) # encode to binary
130130
md5 = m.hexdigest()
131131
# 0-indexed, unlike DBs which are 1-indexed here, so +1 in dbs
132132
half_pos = db.MD5_HEXDIGITS - db.CHECKSUM_HEXDIGITS
133-
return int(md5[half_pos:], 16)
133+
return int(md5[half_pos:], 16) - db.CHECKSUM_OFFSET
134134

135135

136136
class DiffTestCase(unittest.TestCase):

0 commit comments

Comments
 (0)