Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

add checksum offset to avoid bigint overflow #746

Merged
merged 6 commits into from
Oct 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions data_diff/databases/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, QueryError, ConnectError, BaseDialect, Database
from data_diff.databases.base import CHECKSUM_OFFSET
from data_diff.databases._connect import connect as connect
from data_diff.databases._connect import Connect as Connect
from data_diff.databases.postgresql import PostgreSQL as PostgreSQL
Expand Down
11 changes: 11 additions & 0 deletions data_diff/databases/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1156,6 +1156,17 @@ def is_autocommit(self) -> bool:
# Width of the checksum in bits: every hex digit carries 4 bits.
_CHECKSUM_BITSIZE = CHECKSUM_HEXDIGITS << 2
# Bit mask with the lowest _CHECKSUM_BITSIZE bits set.
CHECKSUM_MASK = (2**_CHECKSUM_BITSIZE) - 1

# A bigint is typically 8 bytes wide.
# When the checksum is shorter than that, most databases pad it with leading zeros,
# e.g. 0xFF → 0x00000000000000FF.
# As a result, its numeric representation is always positive,
# which limits how many checksums can be added together before the sum overflows.
# We work around that by subtracting an offset of half the maximum value,
# so that the values are distributed from -0.5*max to +0.5*max.
# Negative values then compensate for the positive ones, which allows more checksums
# to be added together without overflowing.
CHECKSUM_OFFSET = CHECKSUM_MASK // 2

DEFAULT_DATETIME_PRECISION = 6
DEFAULT_NUMERIC_PRECISION = 24

Expand Down
5 changes: 4 additions & 1 deletion data_diff/databases/bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
ConnectError,
apply_query,
QueryResult,
CHECKSUM_OFFSET,
CHECKSUM_HEXDIGITS,
MD5_HEXDIGITS,
)
from data_diff.databases.base import TIMESTAMP_PRECISION_POS, ThreadLocalInterpreter, Mixin_RandomSample

Expand All @@ -62,7 +65,7 @@ def import_bigquery_service_account_impersonation():
@attrs.define(frozen=False)
class Mixin_MD5(AbstractMixin_MD5):
    """BigQuery implementation of the MD5-checksum mixin."""

    def md5_as_int(self, s: str) -> str:
        """Build the SQL expression that turns the MD5 of ``s`` into a number.

        Args:
            s: SQL fragment whose value is hashed; it is spliced verbatim
                into the generated expression.

        Returns:
            SQL text that keeps the last ``CHECKSUM_HEXDIGITS`` hex digits
            of the digest, casts them to int64 and then numeric, and
            subtracts ``CHECKSUM_OFFSET``.
        """
        # SQL substr positions are 1-based, hence the leading "1 +".
        hex_start = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
        return (
            f"cast(cast( ('0x' || substr(TO_HEX(md5({s})), {hex_start})) as int64) as numeric)"
            f" - {CHECKSUM_OFFSET}"
        )


@attrs.define(frozen=False)
Expand Down
5 changes: 4 additions & 1 deletion data_diff/databases/clickhouse.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
MD5_HEXDIGITS,
CHECKSUM_HEXDIGITS,
TIMESTAMP_PRECISION_POS,
CHECKSUM_OFFSET,
BaseDialect,
ThreadedDatabase,
import_helper,
Expand Down Expand Up @@ -41,7 +42,9 @@ def import_clickhouse():
class Mixin_MD5(AbstractMixin_MD5):
    """ClickHouse implementation of the MD5-checksum mixin."""

    def md5_as_int(self, s: str) -> str:
        """Build the SQL expression that turns the MD5 of ``s`` into a number.

        Args:
            s: SQL fragment whose value is hashed; it is spliced verbatim
                into the generated expression.

        Returns:
            SQL text that keeps the last ``CHECKSUM_HEXDIGITS`` hex digits
            of the digest, reinterprets them as a UInt128 and subtracts
            ``CHECKSUM_OFFSET``.
        """
        # SQL substr positions are 1-based, hence the leading "1 +".
        substr_idx = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
        return (
            f"reinterpretAsUInt128(reverse(unhex(lowerUTF8(substr(hex(MD5({s})), {substr_idx}))))) - {CHECKSUM_OFFSET}"
        )


@attrs.define(frozen=False)
Expand Down
3 changes: 2 additions & 1 deletion data_diff/databases/databricks.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from data_diff.databases.base import (
MD5_HEXDIGITS,
CHECKSUM_HEXDIGITS,
CHECKSUM_OFFSET,
BaseDialect,
ThreadedDatabase,
import_helper,
Expand All @@ -39,7 +40,7 @@ def import_databricks():
@attrs.define(frozen=False)
class Mixin_MD5(AbstractMixin_MD5):
    """Databricks implementation of the MD5-checksum mixin."""

    def md5_as_int(self, s: str) -> str:
        """Build the SQL expression that turns the MD5 of ``s`` into a number.

        Args:
            s: SQL fragment whose value is hashed; it is spliced verbatim
                into the generated expression.

        Returns:
            SQL text that converts the last ``CHECKSUM_HEXDIGITS`` hex
            digits of the digest from base 16 to base 10, casts the result
            to ``decimal(38, 0)`` and subtracts ``CHECKSUM_OFFSET``.
        """
        # SQL substr positions are 1-based, hence the leading "1 +".
        hex_start = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
        return (
            f"cast(conv(substr(md5({s}), {hex_start}), 16, 10) as decimal(38, 0))"
            f" - {CHECKSUM_OFFSET}"
        )


@attrs.define(frozen=False)
Expand Down
3 changes: 2 additions & 1 deletion data_diff/databases/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
ConnectError,
ThreadLocalInterpreter,
TIMESTAMP_PRECISION_POS,
CHECKSUM_OFFSET,
)
from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, Mixin_Schema
from data_diff.queries.ast_classes import Func, Compilable, ITable
Expand All @@ -45,7 +46,7 @@ def import_duckdb():
@attrs.define(frozen=False)
class Mixin_MD5(AbstractMixin_MD5):
    """DuckDB implementation of the MD5-checksum mixin."""

    def md5_as_int(self, s: str) -> str:
        """Build the SQL expression that turns the MD5 of ``s`` into a number.

        Args:
            s: SQL fragment whose value is hashed; it is spliced verbatim
                into the generated expression.

        Returns:
            SQL text that takes ``CHECKSUM_HEXDIGITS`` hex digits from the
            end of the digest, casts the ``0x``-prefixed literal to BIGINT
            and subtracts ``CHECKSUM_OFFSET``.
        """
        # SQL substring positions are 1-based, hence the leading "1 +".
        hex_start = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
        return (
            f"('0x' || SUBSTRING(md5({s}), {hex_start},{CHECKSUM_HEXDIGITS}))::BIGINT"
            f" - {CHECKSUM_OFFSET}"
        )


@attrs.define(frozen=False)
Expand Down
3 changes: 2 additions & 1 deletion data_diff/databases/mssql.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
CHECKSUM_HEXDIGITS,
Mixin_OptimizerHints,
Mixin_RandomSample,
CHECKSUM_OFFSET,
QueryError,
ThreadedDatabase,
import_helper,
Expand Down Expand Up @@ -60,7 +61,7 @@ def normalize_number(self, value: str, coltype: FractionalType) -> str:
@attrs.define(frozen=False)
class Mixin_MD5(AbstractMixin_MD5):
    """SQL Server implementation of the MD5-checksum mixin."""

    def md5_as_int(self, s: str) -> str:
        """Build the SQL expression that turns the MD5 of ``s`` into a number.

        Args:
            s: SQL fragment whose value is hashed; it is spliced verbatim
                into the generated expression.

        Returns:
            SQL text that takes the rightmost ``CHECKSUM_HEXDIGITS`` hex
            digits of the digest, converts them through varbinary to
            bigint and subtracts ``CHECKSUM_OFFSET``.
        """
        hex_tail = f"RIGHT(CONVERT(NVARCHAR(32), HashBytes('MD5', {s}), 2), {CHECKSUM_HEXDIGITS})"
        return f"convert(bigint, convert(varbinary, '0x' + {hex_tail}, 1)) - {CHECKSUM_OFFSET}"


@attrs.define(frozen=False)
Expand Down
3 changes: 2 additions & 1 deletion data_diff/databases/mysql.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
MD5_HEXDIGITS,
CHECKSUM_HEXDIGITS,
TIMESTAMP_PRECISION_POS,
CHECKSUM_OFFSET,
Mixin_Schema,
Mixin_RandomSample,
)
Expand All @@ -47,7 +48,7 @@ def import_mysql():
@attrs.define(frozen=False)
class Mixin_MD5(AbstractMixin_MD5):
    """MySQL implementation of the MD5-checksum mixin."""

    def md5_as_int(self, s: str) -> str:
        """Build the SQL expression that turns the MD5 of ``s`` into a number.

        Args:
            s: SQL fragment whose value is hashed; it is spliced verbatim
                into the generated expression.

        Returns:
            SQL text that converts the last ``CHECKSUM_HEXDIGITS`` hex
            digits of the digest from base 16 to base 10, casts the result
            to a signed integer and subtracts ``CHECKSUM_OFFSET``.
        """
        # SQL substring positions are 1-based, hence the leading "1 +".
        hex_start = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
        # NOTE(review): conv() yields a string. The explicit signed cast keeps
        # the subtraction in integer arithmetic instead of relying on implicit
        # string-to-number coercion; an unsigned cast is avoided because the
        # result of the subtraction may be negative.
        return (
            f"cast(conv(substring(md5({s}), {hex_start}), 16, 10) as signed)"
            f" - {CHECKSUM_OFFSET}"
        )


@attrs.define(frozen=False)
Expand Down
5 changes: 4 additions & 1 deletion data_diff/databases/oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
ConnectError,
QueryError,
Mixin_RandomSample,
CHECKSUM_OFFSET,
CHECKSUM_HEXDIGITS,
MD5_HEXDIGITS,
)
from data_diff.databases.base import TIMESTAMP_PRECISION_POS

Expand All @@ -45,7 +48,7 @@ class Mixin_MD5(AbstractMixin_MD5):
def md5_as_int(self, s: str) -> str:
    """Build the SQL expression that turns the MD5 of ``s`` into a number.

    Args:
        s: SQL fragment whose value is hashed; it is spliced verbatim
            into the generated expression.

    Returns:
        SQL text that parses the last ``CHECKSUM_HEXDIGITS`` hex digits of
        the digest with ``to_number`` and subtracts ``CHECKSUM_OFFSET``.
    """
    # standard_hash is faster than DBMS_CRYPTO.Hash
    # TODO: Find a way to use UTL_RAW.CAST_TO_BINARY_INTEGER ?
    # SQL substr positions are 1-based, hence the leading "1 +".
    hex_start = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
    # One 'x' per hex digit, so the format mask always matches the substring length.
    hex_mask = "x" * CHECKSUM_HEXDIGITS
    return (
        f"to_number(substr(standard_hash({s}, 'MD5'), {hex_start}), '{hex_mask}')"
        f" - {CHECKSUM_OFFSET}"
    )


@attrs.define(frozen=False)
Expand Down
3 changes: 2 additions & 1 deletion data_diff/databases/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
CHECKSUM_HEXDIGITS,
_CHECKSUM_BITSIZE,
TIMESTAMP_PRECISION_POS,
CHECKSUM_OFFSET,
Mixin_RandomSample,
)

Expand All @@ -42,7 +43,7 @@ def import_postgresql():
@attrs.define(frozen=False)
class Mixin_MD5(AbstractMixin_MD5):
    """PostgreSQL implementation of the MD5-checksum mixin."""

    def md5_as_int(self, s: str) -> str:
        """Build the SQL expression that turns the MD5 of ``s`` into a number.

        Args:
            s: SQL fragment whose value is hashed; it is spliced verbatim
                into the generated expression.

        Returns:
            SQL text that prefixes the last ``CHECKSUM_HEXDIGITS`` hex
            digits of the digest with ``x``, casts them to a bit string of
            ``_CHECKSUM_BITSIZE`` bits and then to bigint, and subtracts
            ``CHECKSUM_OFFSET``.
        """
        # SQL substring positions are 1-based, hence the leading "1 +".
        hex_start = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
        return (
            f"('x' || substring(md5({s}), {hex_start}))::bit({_CHECKSUM_BITSIZE})::bigint"
            f" - {CHECKSUM_OFFSET}"
        )


@attrs.define(frozen=False)
Expand Down
3 changes: 2 additions & 1 deletion data_diff/databases/presto.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from data_diff.databases.base import (
MD5_HEXDIGITS,
CHECKSUM_HEXDIGITS,
CHECKSUM_OFFSET,
TIMESTAMP_PRECISION_POS,
)

Expand All @@ -56,7 +57,7 @@ def import_presto():
@attrs.define(frozen=False)
class Mixin_MD5(AbstractMixin_MD5):
    """Presto implementation of the MD5-checksum mixin."""

    def md5_as_int(self, s: str) -> str:
        """Build the SQL expression that turns the MD5 of ``s`` into a number.

        Args:
            s: SQL fragment whose value is hashed; it is spliced verbatim
                into the generated expression.

        Returns:
            SQL text that parses the last ``CHECKSUM_HEXDIGITS`` hex digits
            of the digest as base 16, casts the result to
            ``decimal(38, 0)`` and subtracts ``CHECKSUM_OFFSET``.
        """
        # SQL substr positions are 1-based, hence the leading "1 +".
        hex_start = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
        return (
            f"cast(from_base(substr(to_hex(md5(to_utf8({s}))), {hex_start}), 16) as decimal(38, 0))"
            f" - {CHECKSUM_OFFSET}"
        )


@attrs.define(frozen=False)
Expand Down
3 changes: 2 additions & 1 deletion data_diff/databases/redshift.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
PostgreSQL,
MD5_HEXDIGITS,
CHECKSUM_HEXDIGITS,
CHECKSUM_OFFSET,
TIMESTAMP_PRECISION_POS,
PostgresqlDialect,
Mixin_NormalizeValue,
Expand All @@ -26,7 +27,7 @@
@attrs.define(frozen=False)
class Mixin_MD5(Mixin_MD5):
    """Redshift variant that overrides ``md5_as_int`` of the ``Mixin_MD5`` it subclasses."""

    def md5_as_int(self, s: str) -> str:
        """Build the SQL expression that turns the MD5 of ``s`` into a number.

        Args:
            s: SQL fragment whose value is hashed; it is spliced verbatim
                into the generated expression.

        Returns:
            SQL text that parses the last ``CHECKSUM_HEXDIGITS`` hex digits
            of the digest with ``strtol`` in base 16, casts the result to
            ``decimal(38)`` and subtracts ``CHECKSUM_OFFSET``.
        """
        # SQL substring positions are 1-based, hence the leading "1 +".
        hex_start = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
        return (
            f"strtol(substring(md5({s}), {hex_start}), 16)::decimal(38)"
            f" - {CHECKSUM_OFFSET}"
        )


@attrs.define(frozen=False)
Expand Down
3 changes: 2 additions & 1 deletion data_diff/databases/snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import_helper,
CHECKSUM_MASK,
ThreadLocalInterpreter,
CHECKSUM_OFFSET,
Mixin_RandomSample,
)

Expand All @@ -46,7 +47,7 @@ def import_snowflake():
@attrs.define(frozen=False)
class Mixin_MD5(AbstractMixin_MD5):
    """Snowflake implementation of the MD5-checksum mixin."""

    def md5_as_int(self, s: str) -> str:
        """Build the SQL expression that turns the MD5 of ``s`` into a number.

        Args:
            s: SQL fragment whose value is hashed; it is spliced verbatim
                into the generated expression.

        Returns:
            SQL text that masks ``md5_number_lower64`` of ``s`` with
            ``CHECKSUM_MASK`` and subtracts ``CHECKSUM_OFFSET``.
        """
        masked_digest = f"BITAND(md5_number_lower64({s}), {CHECKSUM_MASK})"
        return f"{masked_digest} - {CHECKSUM_OFFSET}"


@attrs.define(frozen=False)
Expand Down
3 changes: 2 additions & 1 deletion data_diff/databases/vertica.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from data_diff.utils import match_regexps
from data_diff.databases.base import (
CHECKSUM_HEXDIGITS,
CHECKSUM_OFFSET,
MD5_HEXDIGITS,
TIMESTAMP_PRECISION_POS,
BaseDialect,
Expand Down Expand Up @@ -42,7 +43,7 @@ def import_vertica():
@attrs.define(frozen=False)
class Mixin_MD5(AbstractMixin_MD5):
    """Vertica implementation of the MD5-checksum mixin."""

    def md5_as_int(self, s: str) -> str:
        """Build the SQL expression that turns the MD5 of ``s`` into a number.

        Args:
            s: SQL fragment whose value is hashed; it is spliced verbatim
                into the generated expression.

        Returns:
            SQL text that converts the last ``CHECKSUM_HEXDIGITS`` hex
            digits of the digest with ``HEX_TO_INTEGER``, casts the result
            to ``NUMERIC(38, 0)`` and subtracts ``CHECKSUM_OFFSET``.
        """
        # SQL SUBSTRING positions are 1-based, hence the leading "1 +".
        hex_start = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
        return (
            f"CAST(HEX_TO_INTEGER(SUBSTRING(MD5({s}), {hex_start})) AS NUMERIC(38, 0))"
            f" - {CHECKSUM_OFFSET}"
        )


@attrs.define(frozen=False)
Expand Down
4 changes: 2 additions & 2 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,13 @@ def str_to_checksum(str: str):
# hello world
# => 5eb63bbbe01eeed093cb22bb8f5acdc3
# => 3cb22bb8f5acdc3
# => 273350391345368515
# => 273350391345368515 - offset (see db.CHECKSUM_OFFSET)
m = hashlib.md5()
m.update(str.encode("utf-8")) # encode to binary
md5 = m.hexdigest()
# 0-indexed, unlike DBs which are 1-indexed here, so +1 in dbs
half_pos = db.MD5_HEXDIGITS - db.CHECKSUM_HEXDIGITS
return int(md5[half_pos:], 16)
return int(md5[half_pos:], 16) - db.CHECKSUM_OFFSET


class DiffTestCase(unittest.TestCase):
Expand Down