Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 4a21935

Browse files
authored
Merge pull request #754 from datafold/squash-abstract-mixins
Simplify: Squash the MD5 & NormalizerValue & OptimizerHints mixins into the base dialect
2 parents 038cf17 + 9aa6e55 commit 4a21935

File tree

14 files changed

+117
-202
lines changed

14 files changed

+117
-202
lines changed

data_diff/abcs/mixins.py

Lines changed: 0 additions & 133 deletions
This file was deleted.

data_diff/databases/base.py

Lines changed: 96 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import attrs
1818
from typing_extensions import Self
1919

20-
from data_diff.abcs.compiler import AbstractCompiler
20+
from data_diff.abcs.compiler import AbstractCompiler, Compilable
2121
from data_diff.queries.extras import ApplyFuncAndNormalizeAsString, Checksum, NormalizeAsString
2222
from data_diff.utils import ArithString, is_uuid, join_iter, safezip
2323
from data_diff.queries.api import Expr, table, Select, SKIP, Explain, Code, this
@@ -55,6 +55,8 @@
5555
)
5656
from data_diff.abcs.database_types import (
5757
Array,
58+
ColType_UUID,
59+
FractionalType,
5860
Struct,
5961
ColType,
6062
Integer,
@@ -73,11 +75,6 @@
7375
Boolean,
7476
JSON,
7577
)
76-
from data_diff.abcs.mixins import Compilable
77-
from data_diff.abcs.mixins import (
78-
AbstractMixin_NormalizeValue,
79-
AbstractMixin_OptimizerHints,
80-
)
8178

8279
logger = logging.getLogger("database")
8380
cv_params = contextvars.ContextVar("params")
@@ -198,12 +195,6 @@ def apply_query(callback: Callable[[str], Any], sql_code: Union[str, ThreadLocal
198195
return callback(sql_code)
199196

200197

201-
@attrs.define(frozen=False)
202-
class Mixin_OptimizerHints(AbstractMixin_OptimizerHints):
203-
def optimizer_hints(self, hints: str) -> str:
204-
return f"/*+ {hints} */ "
205-
206-
207198
@attrs.define(frozen=False)
208199
class BaseDialect(abc.ABC):
209200
SUPPORTS_PRIMARY_KEY: ClassVar[bool] = False
@@ -771,6 +762,98 @@ def to_string(self, s: str) -> str:
771762
def set_timezone_to_utc(self) -> str:
772763
"Provide SQL for setting the session timezone to UTC"
773764

765+
@abstractmethod
766+
def md5_as_int(self, s: str) -> str:
767+
"Provide SQL for computing md5 and returning an int"
768+
769+
@abstractmethod
770+
def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
771+
"""Creates an SQL expression, that converts 'value' to a normalized timestamp.
772+
773+
The returned expression must accept any SQL datetime/timestamp, and return a string.
774+
775+
Date format: ``YYYY-MM-DD HH:mm:SS.FFFFFF``
776+
777+
Precision of dates should be rounded up/down according to coltype.rounds
778+
"""
779+
780+
@abstractmethod
781+
def normalize_number(self, value: str, coltype: FractionalType) -> str:
782+
"""Creates an SQL expression, that converts 'value' to a normalized number.
783+
784+
The returned expression must accept any SQL int/numeric/float, and return a string.
785+
786+
Floats/Decimals are expected in the format
787+
"I.P"
788+
789+
Where I is the integer part of the number (as many digits as necessary),
790+
and must be at least one digit (0).
791+
P is the fractional digits, the amount of which is specified with
792+
coltype.precision. Trailing zeroes may be necessary.
793+
If P is 0, the dot is omitted.
794+
795+
Note: We use 'precision' differently than most databases. For decimals,
796+
it's the same as ``numeric_scale``, and for floats, which use binary precision,
797+
it can be calculated as ``log10(2**numeric_precision)``.
798+
"""
799+
800+
def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
801+
"""Creates an SQL expression, that converts 'value' to either '0' or '1'."""
802+
return self.to_string(value)
803+
804+
def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
805+
"""Creates an SQL expression, that strips uuids of artifacts like whitespace."""
806+
if isinstance(coltype, String_UUID):
807+
return f"TRIM({value})"
808+
return self.to_string(value)
809+
810+
def normalize_json(self, value: str, _coltype: JSON) -> str:
811+
"""Creates an SQL expression, that converts 'value' to its minified json string representation."""
812+
return self.to_string(value)
813+
814+
def normalize_array(self, value: str, _coltype: Array) -> str:
815+
"""Creates an SQL expression, that serializes an array into a JSON string."""
816+
return self.to_string(value)
817+
818+
def normalize_struct(self, value: str, _coltype: Struct) -> str:
819+
"""Creates an SQL expression, that serializes a typed struct into a JSON string."""
820+
return self.to_string(value)
821+
822+
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
823+
"""Creates an SQL expression, that converts 'value' to a normalized representation.
824+
825+
The returned expression must accept any SQL value, and return a string.
826+
827+
The default implementation dispatches to a method according to `coltype`:
828+
829+
::
830+
831+
TemporalType -> normalize_timestamp()
832+
FractionalType -> normalize_number()
833+
*else* -> to_string()
834+
835+
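(`Integer` falls in the *else* category; UUID, Boolean, JSON, Array and Struct types are dispatched to their own ``normalize_*()`` methods before the *else* fallback)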
836+
837+
"""
838+
if isinstance(coltype, TemporalType):
839+
return self.normalize_timestamp(value, coltype)
840+
elif isinstance(coltype, FractionalType):
841+
return self.normalize_number(value, coltype)
842+
elif isinstance(coltype, ColType_UUID):
843+
return self.normalize_uuid(value, coltype)
844+
elif isinstance(coltype, Boolean):
845+
return self.normalize_boolean(value, coltype)
846+
elif isinstance(coltype, JSON):
847+
return self.normalize_json(value, coltype)
848+
elif isinstance(coltype, Array):
849+
return self.normalize_array(value, coltype)
850+
elif isinstance(coltype, Struct):
851+
return self.normalize_struct(value, coltype)
852+
return self.to_string(value)
853+
854+
def optimizer_hints(self, hints: str) -> str:
855+
return f"/*+ {hints} */ "
856+
774857

775858
T = TypeVar("T", bound=BaseDialect)
776859

@@ -966,10 +1049,7 @@ def _refine_coltypes(
9661049
if not text_columns:
9671050
return
9681051

969-
if isinstance(self.dialect, AbstractMixin_NormalizeValue):
970-
fields = [Code(self.dialect.normalize_uuid(self.dialect.quote(c), String_UUID())) for c in text_columns]
971-
else:
972-
fields = this[text_columns]
1052+
fields = [Code(self.dialect.normalize_uuid(self.dialect.quote(c), String_UUID())) for c in text_columns]
9731053

9741054
samples_by_row = self.query(
9751055
table(*table_path).select(*fields).where(Code(where) if where else SKIP).limit(sample_size), list

data_diff/databases/bigquery.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,6 @@
2020
Boolean,
2121
UnknownColType,
2222
)
23-
from data_diff.abcs.mixins import (
24-
AbstractMixin_MD5,
25-
AbstractMixin_NormalizeValue,
26-
)
27-
from data_diff.abcs.compiler import Compilable
28-
from data_diff.queries.api import this, table, SKIP, code
2923
from data_diff.databases.base import (
3024
BaseDialect,
3125
Database,
@@ -61,7 +55,7 @@ def import_bigquery_service_account_impersonation():
6155

6256

6357
@attrs.define(frozen=False)
64-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
58+
class Dialect(BaseDialect):
6559
name = "BigQuery"
6660
ROUNDS_ON_PREC_LOSS = False # Technically BigQuery doesn't allow implicit rounding or truncation
6761
TYPE_CLASSES = {

data_diff/databases/clickhouse.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
Timestamp,
2525
Boolean,
2626
)
27-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
2827

2928
# https://clickhouse.com/docs/en/operations/server-configuration-parameters/settings/#default-database
3029
DEFAULT_DATABASE = "default"
@@ -38,7 +37,7 @@ def import_clickhouse():
3837

3938

4039
@attrs.define(frozen=False)
41-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
40+
class Dialect(BaseDialect):
4241
name = "Clickhouse"
4342
ROUNDS_ON_PREC_LOSS = False
4443
TYPE_CLASSES = {

data_diff/databases/databricks.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
UnknownColType,
1818
Boolean,
1919
)
20-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
2120
from data_diff.databases.base import (
2221
MD5_HEXDIGITS,
2322
CHECKSUM_HEXDIGITS,
@@ -37,7 +36,7 @@ def import_databricks():
3736

3837

3938
@attrs.define(frozen=False)
40-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
39+
class Dialect(BaseDialect):
4140
name = "Databricks"
4241
ROUNDS_ON_PREC_LOSS = True
4342
TYPE_CLASSES = {

data_diff/databases/duckdb.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,6 @@
1717
FractionalType,
1818
Boolean,
1919
)
20-
from data_diff.abcs.mixins import (
21-
AbstractMixin_MD5,
22-
AbstractMixin_NormalizeValue,
23-
)
2420
from data_diff.databases.base import (
2521
Database,
2622
BaseDialect,
@@ -41,7 +37,7 @@ def import_duckdb():
4137

4238

4339
@attrs.define(frozen=False)
44-
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
40+
class Dialect(BaseDialect):
4541
name = "DuckDB"
4642
ROUNDS_ON_PREC_LOSS = False
4743
SUPPORTS_PRIMARY_KEY = True

data_diff/databases/mssql.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,8 @@
22

33
import attrs
44

5-
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
65
from data_diff.databases.base import (
76
CHECKSUM_HEXDIGITS,
8-
Mixin_OptimizerHints,
97
CHECKSUM_OFFSET,
108
QueryError,
119
ThreadedDatabase,
@@ -37,12 +35,7 @@ def import_mssql():
3735

3836

3937
@attrs.define(frozen=False)
40-
class Dialect(
41-
BaseDialect,
42-
Mixin_OptimizerHints,
43-
AbstractMixin_MD5,
44-
AbstractMixin_NormalizeValue,
45-
):
38+
class Dialect(BaseDialect):
4639
name = "MsSQL"
4740
ROUNDS_ON_PREC_LOSS = True
4841
SUPPORTS_PRIMARY_KEY = True

0 commit comments

Comments
 (0)