Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit a39cd0b

Browse files
author
Sergey Vasilyev
committed
Squash abstract MD5 & Normalizer mixins into the base dialect
The MD5 & normalizing methods are implemented by every supported database dialect, with 100% coverage: there is no database that does not implement them. As such, they can simply be moved into the base dialect class, which ensures 100% coverage in the future. No changes are required in the specific dialect classes.
1 parent 87429de commit a39cd0b

File tree

14 files changed

+113
-153
lines changed

14 files changed

+113
-153
lines changed

data_diff/abcs/mixins.py

Lines changed: 0 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -19,104 +19,3 @@
1919
@attrs.define(frozen=False)
2020
class AbstractMixin(ABC):
2121
"A mixin for a database dialect"
22-
23-
24-
@attrs.define(frozen=False)
25-
class AbstractMixin_NormalizeValue(AbstractMixin):
26-
@abstractmethod
27-
def to_comparable(self, value: str, coltype: ColType) -> str:
28-
"""Ensure that the expression is comparable in ``IS DISTINCT FROM``."""
29-
30-
@abstractmethod
31-
def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
32-
"""Creates an SQL expression, that converts 'value' to a normalized timestamp.
33-
34-
The returned expression must accept any SQL datetime/timestamp, and return a string.
35-
36-
Date format: ``YYYY-MM-DD HH:mm:SS.FFFFFF``
37-
38-
Precision of dates should be rounded up/down according to coltype.rounds
39-
"""
40-
41-
@abstractmethod
42-
def normalize_number(self, value: str, coltype: FractionalType) -> str:
43-
"""Creates an SQL expression, that converts 'value' to a normalized number.
44-
45-
The returned expression must accept any SQL int/numeric/float, and return a string.
46-
47-
Floats/Decimals are expected in the format
48-
"I.P"
49-
50-
Where I is the integer part of the number (as many digits as necessary),
51-
and must be at least one digit (0).
52-
P is the fractional digits, the amount of which is specified with
53-
coltype.precision. Trailing zeroes may be necessary.
54-
If P is 0, the dot is omitted.
55-
56-
Note: We use 'precision' differently than most databases. For decimals,
57-
it's the same as ``numeric_scale``, and for floats, which use binary precision,
58-
it can be calculated as ``log10(2**numeric_precision)``.
59-
"""
60-
61-
def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
62-
"""Creates an SQL expression, that converts 'value' to either '0' or '1'."""
63-
return self.to_string(value)
64-
65-
def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
66-
"""Creates an SQL expression, that strips uuids of artifacts like whitespace."""
67-
if isinstance(coltype, String_UUID):
68-
return f"TRIM({value})"
69-
return self.to_string(value)
70-
71-
def normalize_json(self, value: str, _coltype: JSON) -> str:
72-
"""Creates an SQL expression, that converts 'value' to its minified json string representation."""
73-
return self.to_string(value)
74-
75-
def normalize_array(self, value: str, _coltype: Array) -> str:
76-
"""Creates an SQL expression, that serializes an array into a JSON string."""
77-
return self.to_string(value)
78-
79-
def normalize_struct(self, value: str, _coltype: Struct) -> str:
80-
"""Creates an SQL expression, that serializes a typed struct into a JSON string."""
81-
return self.to_string(value)
82-
83-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
84-
"""Creates an SQL expression, that converts 'value' to a normalized representation.
85-
86-
The returned expression must accept any SQL value, and return a string.
87-
88-
The default implementation dispatches to a method according to `coltype`:
89-
90-
::
91-
92-
TemporalType -> normalize_timestamp()
93-
FractionalType -> normalize_number()
94-
*else* -> to_string()
95-
96-
(`Integer` falls in the *else* category)
97-
98-
"""
99-
if isinstance(coltype, TemporalType):
100-
return self.normalize_timestamp(value, coltype)
101-
elif isinstance(coltype, FractionalType):
102-
return self.normalize_number(value, coltype)
103-
elif isinstance(coltype, ColType_UUID):
104-
return self.normalize_uuid(value, coltype)
105-
elif isinstance(coltype, Boolean):
106-
return self.normalize_boolean(value, coltype)
107-
elif isinstance(coltype, JSON):
108-
return self.normalize_json(value, coltype)
109-
elif isinstance(coltype, Array):
110-
return self.normalize_array(value, coltype)
111-
elif isinstance(coltype, Struct):
112-
return self.normalize_struct(value, coltype)
113-
return self.to_string(value)
114-
115-
116-
@attrs.define(frozen=False)
117-
class AbstractMixin_MD5(AbstractMixin):
118-
"""Methods for calculating an MD5 hash as an integer."""
119-
120-
@abstractmethod
121-
def md5_as_int(self, s: str) -> str:
122-
"Provide SQL for computing md5 and returning an int"

data_diff/databases/base.py

Lines changed: 92 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@
5555
)
5656
from data_diff.abcs.database_types import (
5757
Array,
58+
ColType_UUID,
59+
FractionalType,
5860
Struct,
5961
ColType,
6062
Integer,
@@ -74,7 +76,6 @@
7476
JSON,
7577
)
7678
from data_diff.abcs.mixins import Compilable
77-
from data_diff.abcs.mixins import AbstractMixin_NormalizeValue
7879

7980
logger = logging.getLogger("database")
8081
cv_params = contextvars.ContextVar("params")
@@ -762,6 +763,95 @@ def to_string(self, s: str) -> str:
762763
def set_timezone_to_utc(self) -> str:
763764
"Provide SQL for setting the session timezone to UTC"
764765

766+
@abstractmethod
767+
def md5_as_int(self, s: str) -> str:
768+
"Provide SQL for computing md5 and returning an int"
769+
770+
@abstractmethod
771+
def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
772+
"""Creates an SQL expression, that converts 'value' to a normalized timestamp.
773+
774+
The returned expression must accept any SQL datetime/timestamp, and return a string.
775+
776+
Date format: ``YYYY-MM-DD HH:mm:SS.FFFFFF``
777+
778+
Precision of dates should be rounded up/down according to coltype.rounds
779+
"""
780+
781+
@abstractmethod
782+
def normalize_number(self, value: str, coltype: FractionalType) -> str:
783+
"""Creates an SQL expression, that converts 'value' to a normalized number.
784+
785+
The returned expression must accept any SQL int/numeric/float, and return a string.
786+
787+
Floats/Decimals are expected in the format
788+
"I.P"
789+
790+
Where I is the integer part of the number (as many digits as necessary),
791+
and must be at least one digit (0).
792+
P is the fractional digits, the amount of which is specified with
793+
coltype.precision. Trailing zeroes may be necessary.
794+
If P is 0, the dot is omitted.
795+
796+
Note: We use 'precision' differently than most databases. For decimals,
797+
it's the same as ``numeric_scale``, and for floats, which use binary precision,
798+
it can be calculated as ``log10(2**numeric_precision)``.
799+
"""
800+
801+
def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
802+
"""Creates an SQL expression, that converts 'value' to either '0' or '1'."""
803+
return self.to_string(value)
804+
805+
def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
806+
"""Creates an SQL expression, that strips uuids of artifacts like whitespace."""
807+
if isinstance(coltype, String_UUID):
808+
return f"TRIM({value})"
809+
return self.to_string(value)
810+
811+
def normalize_json(self, value: str, _coltype: JSON) -> str:
812+
"""Creates an SQL expression, that converts 'value' to its minified json string representation."""
813+
return self.to_string(value)
814+
815+
def normalize_array(self, value: str, _coltype: Array) -> str:
816+
"""Creates an SQL expression, that serializes an array into a JSON string."""
817+
return self.to_string(value)
818+
819+
def normalize_struct(self, value: str, _coltype: Struct) -> str:
820+
"""Creates an SQL expression, that serializes a typed struct into a JSON string."""
821+
return self.to_string(value)
822+
823+
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
824+
"""Creates an SQL expression, that converts 'value' to a normalized representation.
825+
826+
The returned expression must accept any SQL value, and return a string.
827+
828+
The default implementation dispatches to a method according to `coltype`:
829+
830+
::
831+
832+
TemporalType -> normalize_timestamp()
833+
FractionalType -> normalize_number()
834+
*else* -> to_string()
835+
836+
(`Integer` falls in the *else* category)
837+
838+
"""
839+
if isinstance(coltype, TemporalType):
840+
return self.normalize_timestamp(value, coltype)
841+
elif isinstance(coltype, FractionalType):
842+
return self.normalize_number(value, coltype)
843+
elif isinstance(coltype, ColType_UUID):
844+
return self.normalize_uuid(value, coltype)
845+
elif isinstance(coltype, Boolean):
846+
return self.normalize_boolean(value, coltype)
847+
elif isinstance(coltype, JSON):
848+
return self.normalize_json(value, coltype)
849+
elif isinstance(coltype, Array):
850+
return self.normalize_array(value, coltype)
851+
elif isinstance(coltype, Struct):
852+
return self.normalize_struct(value, coltype)
853+
return self.to_string(value)
854+
765855
def optimizer_hints(self, hints: str) -> str:
766856
return f"/*+ {hints} */ "
767857

@@ -960,10 +1050,7 @@ def _refine_coltypes(
9601050
if not text_columns:
9611051
return
9621052

963-
if isinstance(self.dialect, AbstractMixin_NormalizeValue):
964-
fields = [Code(self.dialect.normalize_uuid(self.dialect.quote(c), String_UUID())) for c in text_columns]
965-
else:
966-
fields = this[text_columns]
1053+
fields = [Code(self.dialect.normalize_uuid(self.dialect.quote(c), String_UUID())) for c in text_columns]
9671054

9681055
samples_by_row = self.query(
9691056
table(*table_path).select(*fields).where(Code(where) if where else SKIP).limit(sample_size), list

data_diff/databases/bigquery.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,6 @@
2020
Boolean,
2121
UnknownColType,
2222
)
23-
from data_diff.abcs.mixins import (
24-
AbstractMixin_MD5,
25-
AbstractMixin_NormalizeValue,
26-
)
27-
from data_diff.abcs.compiler import Compilable
28-
from data_diff.queries.api import this, table, SKIP, code
2923
from data_diff.databases.base import (
3024
BaseDialect,
3125
Database,
@@ -61,7 +55,7 @@ def import_bigquery_service_account_impersonation():
6155

6256

6357
@attrs.define(frozen=False)
64-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
58+
class Dialect(BaseDialect):
6559
name = "BigQuery"
6660
ROUNDS_ON_PREC_LOSS = False # Technically BigQuery doesn't allow implicit rounding or truncation
6761
TYPE_CLASSES = {

data_diff/databases/clickhouse.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
Timestamp,
2525
Boolean,
2626
)
27-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
2827

2928
# https://clickhouse.com/docs/en/operations/server-configuration-parameters/settings/#default-database
3029
DEFAULT_DATABASE = "default"
@@ -38,7 +37,7 @@ def import_clickhouse():
3837

3938

4039
@attrs.define(frozen=False)
41-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
40+
class Dialect(BaseDialect):
4241
name = "Clickhouse"
4342
ROUNDS_ON_PREC_LOSS = False
4443
TYPE_CLASSES = {

data_diff/databases/databricks.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
UnknownColType,
1818
Boolean,
1919
)
20-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
2120
from data_diff.databases.base import (
2221
MD5_HEXDIGITS,
2322
CHECKSUM_HEXDIGITS,
@@ -37,7 +36,7 @@ def import_databricks():
3736

3837

3938
@attrs.define(frozen=False)
40-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
39+
class Dialect(BaseDialect):
4140
name = "Databricks"
4241
ROUNDS_ON_PREC_LOSS = True
4342
TYPE_CLASSES = {

data_diff/databases/duckdb.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,6 @@
1717
FractionalType,
1818
Boolean,
1919
)
20-
from data_diff.abcs.mixins import (
21-
AbstractMixin_MD5,
22-
AbstractMixin_NormalizeValue,
23-
)
2420
from data_diff.databases.base import (
2521
Database,
2622
BaseDialect,
@@ -41,7 +37,7 @@ def import_duckdb():
4137

4238

4339
@attrs.define(frozen=False)
44-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
40+
class Dialect(BaseDialect):
4541
name = "DuckDB"
4642
ROUNDS_ON_PREC_LOSS = False
4743
SUPPORTS_PRIMARY_KEY = True

data_diff/databases/mssql.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import attrs
44

5-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
65
from data_diff.databases.base import (
76
CHECKSUM_HEXDIGITS,
87
CHECKSUM_OFFSET,
@@ -36,11 +35,7 @@ def import_mssql():
3635

3736

3837
@attrs.define(frozen=False)
39-
class Dialect(
40-
BaseDialect,
41-
AbstractMixin_MD5,
42-
AbstractMixin_NormalizeValue,
43-
):
38+
class Dialect(BaseDialect):
4439
name = "MsSQL"
4540
ROUNDS_ON_PREC_LOSS = True
4641
SUPPORTS_PRIMARY_KEY = True

data_diff/databases/mysql.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,6 @@
1515
Boolean,
1616
Date,
1717
)
18-
from data_diff.abcs.mixins import (
19-
AbstractMixin_MD5,
20-
AbstractMixin_NormalizeValue,
21-
)
2218
from data_diff.databases.base import (
2319
ThreadedDatabase,
2420
import_helper,
@@ -41,11 +37,7 @@ def import_mysql():
4137

4238

4339
@attrs.define(frozen=False)
44-
class Dialect(
45-
BaseDialect,
46-
AbstractMixin_MD5,
47-
AbstractMixin_NormalizeValue,
48-
):
40+
class Dialect(BaseDialect):
4941
name = "MySQL"
5042
ROUNDS_ON_PREC_LOSS = True
5143
SUPPORTS_PRIMARY_KEY = True

data_diff/databases/oracle.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
TimestampTZ,
1717
FractionalType,
1818
)
19-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
2019
from data_diff.databases.base import (
2120
BaseDialect,
2221
ThreadedDatabase,
@@ -42,8 +41,6 @@ def import_oracle():
4241
@attrs.define(frozen=False)
4342
class Dialect(
4443
BaseDialect,
45-
AbstractMixin_MD5,
46-
AbstractMixin_NormalizeValue,
4744
):
4845
name = "Oracle"
4946
SUPPORTS_PRIMARY_KEY = True

data_diff/databases/postgresql.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
Boolean,
1919
Date,
2020
)
21-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
2221
from data_diff.databases.base import BaseDialect, ThreadedDatabase, import_helper, ConnectError
2322
from data_diff.databases.base import (
2423
MD5_HEXDIGITS,
@@ -40,7 +39,7 @@ def import_postgresql():
4039

4140

4241
@attrs.define(frozen=False)
43-
class PostgresqlDialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
42+
class PostgresqlDialect(BaseDialect):
4443
name = "PostgreSQL"
4544
ROUNDS_ON_PREC_LOSS = True
4645
SUPPORTS_PRIMARY_KEY = True

0 commit comments

Comments
 (0)