Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit b694825

Browse files
author
Sergey Vasilyev
committed
Squash abstract MD5 & Normalizer mixins into the base dialect
The MD5 and normalization methods are implemented by every supported database, with 100% coverage: there is no database that does not implement them. They can therefore simply be moved into the base dialect class, which ensures 100% coverage in the future as well. No changes are required in the specific dialect classes.
1 parent accc5dc commit b694825

File tree

13 files changed

+104
-153
lines changed

13 files changed

+104
-153
lines changed

data_diff/abcs/mixins.py

Lines changed: 0 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -19,104 +19,3 @@
1919
@attrs.define(frozen=False)
2020
class AbstractMixin(ABC):
2121
"A mixin for a database dialect"
22-
23-
24-
@attrs.define(frozen=False)
25-
class AbstractMixin_NormalizeValue(AbstractMixin):
26-
@abstractmethod
27-
def to_comparable(self, value: str, coltype: ColType) -> str:
28-
"""Ensure that the expression is comparable in ``IS DISTINCT FROM``."""
29-
30-
@abstractmethod
31-
def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
32-
"""Creates an SQL expression, that converts 'value' to a normalized timestamp.
33-
34-
The returned expression must accept any SQL datetime/timestamp, and return a string.
35-
36-
Date format: ``YYYY-MM-DD HH:mm:SS.FFFFFF``
37-
38-
Precision of dates should be rounded up/down according to coltype.rounds
39-
"""
40-
41-
@abstractmethod
42-
def normalize_number(self, value: str, coltype: FractionalType) -> str:
43-
"""Creates an SQL expression, that converts 'value' to a normalized number.
44-
45-
The returned expression must accept any SQL int/numeric/float, and return a string.
46-
47-
Floats/Decimals are expected in the format
48-
"I.P"
49-
50-
Where I is the integer part of the number (as many digits as necessary),
51-
and must be at least one digit (0).
52-
P is the fractional digits, the amount of which is specified with
53-
coltype.precision. Trailing zeroes may be necessary.
54-
If P is 0, the dot is omitted.
55-
56-
Note: We use 'precision' differently than most databases. For decimals,
57-
it's the same as ``numeric_scale``, and for floats, which use binary precision,
58-
it can be calculated as ``log10(2**numeric_precision)``.
59-
"""
60-
61-
def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
62-
"""Creates an SQL expression, that converts 'value' to either '0' or '1'."""
63-
return self.to_string(value)
64-
65-
def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
66-
"""Creates an SQL expression, that strips uuids of artifacts like whitespace."""
67-
if isinstance(coltype, String_UUID):
68-
return f"TRIM({value})"
69-
return self.to_string(value)
70-
71-
def normalize_json(self, value: str, _coltype: JSON) -> str:
72-
"""Creates an SQL expression, that converts 'value' to its minified json string representation."""
73-
return self.to_string(value)
74-
75-
def normalize_array(self, value: str, _coltype: Array) -> str:
76-
"""Creates an SQL expression, that serializes an array into a JSON string."""
77-
return self.to_string(value)
78-
79-
def normalize_struct(self, value: str, _coltype: Struct) -> str:
80-
"""Creates an SQL expression, that serializes a typed struct into a JSON string."""
81-
return self.to_string(value)
82-
83-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
84-
"""Creates an SQL expression, that converts 'value' to a normalized representation.
85-
86-
The returned expression must accept any SQL value, and return a string.
87-
88-
The default implementation dispatches to a method according to `coltype`:
89-
90-
::
91-
92-
TemporalType -> normalize_timestamp()
93-
FractionalType -> normalize_number()
94-
*else* -> to_string()
95-
96-
(`Integer` falls in the *else* category)
97-
98-
"""
99-
if isinstance(coltype, TemporalType):
100-
return self.normalize_timestamp(value, coltype)
101-
elif isinstance(coltype, FractionalType):
102-
return self.normalize_number(value, coltype)
103-
elif isinstance(coltype, ColType_UUID):
104-
return self.normalize_uuid(value, coltype)
105-
elif isinstance(coltype, Boolean):
106-
return self.normalize_boolean(value, coltype)
107-
elif isinstance(coltype, JSON):
108-
return self.normalize_json(value, coltype)
109-
elif isinstance(coltype, Array):
110-
return self.normalize_array(value, coltype)
111-
elif isinstance(coltype, Struct):
112-
return self.normalize_struct(value, coltype)
113-
return self.to_string(value)
114-
115-
116-
@attrs.define(frozen=False)
117-
class AbstractMixin_MD5(AbstractMixin):
118-
"""Methods for calculating an MD5 hash as an integer."""
119-
120-
@abstractmethod
121-
def md5_as_int(self, s: str) -> str:
122-
"Provide SQL for computing md5 and returning an int"

data_diff/databases/base.py

Lines changed: 94 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,6 @@
7474
JSON,
7575
)
7676
from data_diff.abcs.mixins import Compilable
77-
from data_diff.abcs.mixins import AbstractMixin_NormalizeValue
7877

7978
logger = logging.getLogger("database")
8079
cv_params = contextvars.ContextVar("params")
@@ -762,6 +761,99 @@ def to_string(self, s: str) -> str:
762761
def set_timezone_to_utc(self) -> str:
763762
"Provide SQL for setting the session timezone to UTC"
764763

764+
@abstractmethod
765+
def md5_as_int(self, s: str) -> str:
766+
"Provide SQL for computing md5 and returning an int"
767+
768+
@abstractmethod
769+
def to_comparable(self, value: str, coltype: ColType) -> str:
770+
"""Ensure that the expression is comparable in ``IS DISTINCT FROM``."""
771+
772+
@abstractmethod
773+
def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
774+
"""Creates an SQL expression, that converts 'value' to a normalized timestamp.
775+
776+
The returned expression must accept any SQL datetime/timestamp, and return a string.
777+
778+
Date format: ``YYYY-MM-DD HH:mm:SS.FFFFFF``
779+
780+
Precision of dates should be rounded up/down according to coltype.rounds
781+
"""
782+
783+
@abstractmethod
784+
def normalize_number(self, value: str, coltype: FractionalType) -> str:
785+
"""Creates an SQL expression, that converts 'value' to a normalized number.
786+
787+
The returned expression must accept any SQL int/numeric/float, and return a string.
788+
789+
Floats/Decimals are expected in the format
790+
"I.P"
791+
792+
Where I is the integer part of the number (as many digits as necessary),
793+
and must be at least one digit (0).
794+
P is the fractional digits, the amount of which is specified with
795+
coltype.precision. Trailing zeroes may be necessary.
796+
If P is 0, the dot is omitted.
797+
798+
Note: We use 'precision' differently than most databases. For decimals,
799+
it's the same as ``numeric_scale``, and for floats, which use binary precision,
800+
it can be calculated as ``log10(2**numeric_precision)``.
801+
"""
802+
803+
def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
804+
"""Creates an SQL expression, that converts 'value' to either '0' or '1'."""
805+
return self.to_string(value)
806+
807+
def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
808+
"""Creates an SQL expression, that strips uuids of artifacts like whitespace."""
809+
if isinstance(coltype, String_UUID):
810+
return f"TRIM({value})"
811+
return self.to_string(value)
812+
813+
def normalize_json(self, value: str, _coltype: JSON) -> str:
814+
"""Creates an SQL expression, that converts 'value' to its minified json string representation."""
815+
return self.to_string(value)
816+
817+
def normalize_array(self, value: str, _coltype: Array) -> str:
818+
"""Creates an SQL expression, that serializes an array into a JSON string."""
819+
return self.to_string(value)
820+
821+
def normalize_struct(self, value: str, _coltype: Struct) -> str:
822+
"""Creates an SQL expression, that serializes a typed struct into a JSON string."""
823+
return self.to_string(value)
824+
825+
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
826+
"""Creates an SQL expression, that converts 'value' to a normalized representation.
827+
828+
The returned expression must accept any SQL value, and return a string.
829+
830+
The default implementation dispatches to a method according to `coltype`:
831+
832+
::
833+
834+
TemporalType -> normalize_timestamp()
835+
FractionalType -> normalize_number()
836+
*else* -> to_string()
837+
838+
(`Integer` falls in the *else* category)
839+
840+
"""
841+
if isinstance(coltype, TemporalType):
842+
return self.normalize_timestamp(value, coltype)
843+
elif isinstance(coltype, FractionalType):
844+
return self.normalize_number(value, coltype)
845+
elif isinstance(coltype, ColType_UUID):
846+
return self.normalize_uuid(value, coltype)
847+
elif isinstance(coltype, Boolean):
848+
return self.normalize_boolean(value, coltype)
849+
elif isinstance(coltype, JSON):
850+
return self.normalize_json(value, coltype)
851+
elif isinstance(coltype, Array):
852+
return self.normalize_array(value, coltype)
853+
elif isinstance(coltype, Struct):
854+
return self.normalize_struct(value, coltype)
855+
return self.to_string(value)
856+
765857
def optimizer_hints(self, hints: str) -> str:
766858
return f"/*+ {hints} */ "
767859

@@ -958,10 +1050,7 @@ def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType], whe
9581050
if not text_columns:
9591051
return
9601052

961-
if isinstance(self.dialect, AbstractMixin_NormalizeValue):
962-
fields = [Code(self.dialect.normalize_uuid(self.dialect.quote(c), String_UUID())) for c in text_columns]
963-
else:
964-
fields = this[text_columns]
1053+
fields = [Code(self.dialect.normalize_uuid(self.dialect.quote(c), String_UUID())) for c in text_columns]
9651054

9661055
samples_by_row = self.query(
9671056
table(*table_path).select(*fields).where(Code(where) if where else SKIP).limit(sample_size), list

data_diff/databases/bigquery.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,6 @@
2020
Boolean,
2121
UnknownColType,
2222
)
23-
from data_diff.abcs.mixins import (
24-
AbstractMixin_MD5,
25-
AbstractMixin_NormalizeValue,
26-
)
27-
from data_diff.abcs.compiler import Compilable
28-
from data_diff.queries.api import this, table, SKIP, code
2923
from data_diff.databases.base import (
3024
BaseDialect,
3125
Database,
@@ -61,7 +55,7 @@ def import_bigquery_service_account_impersonation():
6155

6256

6357
@attrs.define(frozen=False)
64-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
58+
class Dialect(BaseDialect):
6559
name = "BigQuery"
6660
ROUNDS_ON_PREC_LOSS = False # Technically BigQuery doesn't allow implicit rounding or truncation
6761
TYPE_CLASSES = {

data_diff/databases/clickhouse.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
Timestamp,
2525
Boolean,
2626
)
27-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
2827

2928
# https://clickhouse.com/docs/en/operations/server-configuration-parameters/settings/#default-database
3029
DEFAULT_DATABASE = "default"
@@ -38,7 +37,7 @@ def import_clickhouse():
3837

3938

4039
@attrs.define(frozen=False)
41-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
40+
class Dialect(BaseDialect):
4241
name = "Clickhouse"
4342
ROUNDS_ON_PREC_LOSS = False
4443
TYPE_CLASSES = {

data_diff/databases/databricks.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
UnknownColType,
1818
Boolean,
1919
)
20-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
2120
from data_diff.databases.base import (
2221
MD5_HEXDIGITS,
2322
CHECKSUM_HEXDIGITS,
@@ -37,7 +36,7 @@ def import_databricks():
3736

3837

3938
@attrs.define(frozen=False)
40-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
39+
class Dialect(BaseDialect):
4140
name = "Databricks"
4241
ROUNDS_ON_PREC_LOSS = True
4342
TYPE_CLASSES = {

data_diff/databases/duckdb.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,6 @@
1717
FractionalType,
1818
Boolean,
1919
)
20-
from data_diff.abcs.mixins import (
21-
AbstractMixin_MD5,
22-
AbstractMixin_NormalizeValue,
23-
)
2420
from data_diff.databases.base import (
2521
Database,
2622
BaseDialect,
@@ -41,7 +37,7 @@ def import_duckdb():
4137

4238

4339
@attrs.define(frozen=False)
44-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
40+
class Dialect(BaseDialect):
4541
name = "DuckDB"
4642
ROUNDS_ON_PREC_LOSS = False
4743
SUPPORTS_PRIMARY_KEY = True

data_diff/databases/mssql.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import attrs
44

5-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
65
from data_diff.databases.base import (
76
CHECKSUM_HEXDIGITS,
87
CHECKSUM_OFFSET,
@@ -36,11 +35,7 @@ def import_mssql():
3635

3736

3837
@attrs.define(frozen=False)
39-
class Dialect(
40-
BaseDialect,
41-
AbstractMixin_MD5,
42-
AbstractMixin_NormalizeValue,
43-
):
38+
class Dialect(BaseDialect):
4439
name = "MsSQL"
4540
ROUNDS_ON_PREC_LOSS = True
4641
SUPPORTS_PRIMARY_KEY = True

data_diff/databases/mysql.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,6 @@
1515
Boolean,
1616
Date,
1717
)
18-
from data_diff.abcs.mixins import (
19-
AbstractMixin_MD5,
20-
AbstractMixin_NormalizeValue,
21-
)
2218
from data_diff.databases.base import (
2319
ThreadedDatabase,
2420
import_helper,
@@ -41,11 +37,7 @@ def import_mysql():
4137

4238

4339
@attrs.define(frozen=False)
44-
class Dialect(
45-
BaseDialect,
46-
AbstractMixin_MD5,
47-
AbstractMixin_NormalizeValue,
48-
):
40+
class Dialect(BaseDialect):
4941
name = "MySQL"
5042
ROUNDS_ON_PREC_LOSS = True
5143
SUPPORTS_PRIMARY_KEY = True

data_diff/databases/oracle.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
TimestampTZ,
1717
FractionalType,
1818
)
19-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
2019
from data_diff.databases.base import (
2120
BaseDialect,
2221
ThreadedDatabase,
@@ -42,8 +41,6 @@ def import_oracle():
4241
@attrs.define(frozen=False)
4342
class Dialect(
4443
BaseDialect,
45-
AbstractMixin_MD5,
46-
AbstractMixin_NormalizeValue,
4744
):
4845
name = "Oracle"
4946
SUPPORTS_PRIMARY_KEY = True

data_diff/databases/postgresql.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
Boolean,
1919
Date,
2020
)
21-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
2221
from data_diff.databases.base import BaseDialect, ThreadedDatabase, import_helper, ConnectError
2322
from data_diff.databases.base import (
2423
MD5_HEXDIGITS,
@@ -40,7 +39,7 @@ def import_postgresql():
4039

4140

4241
@attrs.define(frozen=False)
43-
class PostgresqlDialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
42+
class PostgresqlDialect(BaseDialect):
4443
name = "PostgreSQL"
4544
ROUNDS_ON_PREC_LOSS = True
4645
SUPPORTS_PRIMARY_KEY = True

data_diff/databases/presto.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
TemporalType,
2222
Boolean,
2323
)
24-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
2524
from data_diff.databases.base import (
2625
BaseDialect,
2726
Database,
@@ -52,7 +51,7 @@ def import_presto():
5251
return prestodb
5352

5453

55-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
54+
class Dialect(BaseDialect):
5655
name = "Presto"
5756
ROUNDS_ON_PREC_LOSS = True
5857
TYPE_CLASSES = {

0 commit comments

Comments
 (0)