Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Simplify: Remove unused code & mixins #753

Merged
merged 3 commits into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 0 additions & 60 deletions data_diff/abcs/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,66 +122,6 @@ def md5_as_int(self, s: str) -> str:
"Provide SQL for computing md5 and returning an int"


@attrs.define(frozen=False)
class AbstractMixin_Schema(AbstractMixin):
    """Dialect mixin declaring the queries used to inspect the database schema.

    TODO: Move AbstractDatabase.query_table_schema() and friends over here
    """

    def table_information(self) -> Compilable:
        """Query to return a table of schema information about existing tables.

        Not declared abstract; this base version always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError()

    @abstractmethod
    def list_tables(self, table_schema: str, like: Compilable = None) -> Compilable:
        """Query to select the list of tables in ``table_schema``. (query return type: table[str])

        If ``like`` is specified, its value is applied to the table name using the 'like' operator.
        """


@attrs.define(frozen=False)
class AbstractMixin_RandomSample(AbstractMixin):
    """Dialect mixin declaring the random-sampling operations."""

    @abstractmethod
    def random_sample_n(self, tbl: str, size: int) -> str:
        """Take a random sample of the given size, i.e. return ``size`` rows of ``tbl``."""

    @abstractmethod
    def random_sample_ratio_approx(self, tbl: str, ratio: float) -> str:
        """Take a random sample whose approximate size is determined by ``ratio``.

        ``ratio`` is in the range 0..1, where 0 means no rows and 1 means all rows.
        The size is approximate, i.e. the actual amount of rows returned may vary
        by standard deviation.
        """

    # Not implemented (kept from the original as a disabled sketch):
    # def random_sample_ratio(self, table: ITable, ratio: float):
    #     """Take a random sample of the size determined by the ratio (0..1), where 0 means no rows, and 1 means all rows
    #     """


@attrs.define(frozen=False)
class AbstractMixin_TimeTravel(AbstractMixin):
    """Dialect mixin declaring how to query a table's historical data."""

    @abstractmethod
    def time_travel(
        self,
        table: Compilable,
        before: bool = False,
        timestamp: Compilable = None,
        offset: Compilable = None,
        statement: Compilable = None,
    ) -> Compilable:
        """Select historical data from a table.

        Parameters:
            table - The name of the table whose history we're querying
            before - not described by the original docs; presumably selects the
                state just before the given point rather than at it (TODO confirm)
            timestamp - A constant timestamp
            offset - the time 'offset' seconds before now
            statement - identifier for statement, e.g. query ID

        Exactly one of `timestamp`, `offset` or `statement` must be specified.
        """


@attrs.define(frozen=False)
class AbstractMixin_OptimizerHints(AbstractMixin):
@abstractmethod
Expand Down
51 changes: 1 addition & 50 deletions data_diff/databases/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
TableAlias,
TableOp,
TablePath,
TimeTravel,
TruncateTable,
UnaryOp,
WhenThen,
Expand All @@ -74,10 +73,8 @@
Boolean,
JSON,
)
from data_diff.abcs.mixins import AbstractMixin_TimeTravel, Compilable
from data_diff.abcs.mixins import Compilable
from data_diff.abcs.mixins import (
AbstractMixin_Schema,
AbstractMixin_RandomSample,
AbstractMixin_NormalizeValue,
AbstractMixin_OptimizerHints,
)
Expand Down Expand Up @@ -201,33 +198,6 @@ def apply_query(callback: Callable[[str], Any], sql_code: Union[str, ThreadLocal
return callback(sql_code)


@attrs.define(frozen=False)
class Mixin_Schema(AbstractMixin_Schema):
    """Schema mixin whose queries read from ``information_schema.tables``."""

    def table_information(self) -> Compilable:
        """Return the table expression for ``information_schema.tables``."""
        return table("information_schema", "tables")

    def list_tables(self, table_schema: str, like: Compilable = None) -> Compilable:
        """Build a query selecting the names of the base tables in ``table_schema``.

        Rows of ``self.table_information()`` are kept when their ``table_schema``
        equals the argument and their ``table_type`` is "BASE TABLE". When ``like``
        is not None the table name must also match it via ``like``; otherwise that
        condition is replaced by ``SKIP``.
        """
        info = self.table_information()
        conditions = [
            this.table_schema == table_schema,
            SKIP if like is None else this.table_name.like(like),
            this.table_type == "BASE TABLE",
        ]
        matching = info.where(*conditions)
        return matching.select(this.table_name)


@attrs.define(frozen=False)
class Mixin_RandomSample(AbstractMixin_RandomSample):
    """Random-sampling mixin built from ``Random()`` expressions."""

    def random_sample_n(self, tbl: ITable, size: int) -> ITable:
        """Return ``tbl`` ordered by ``Random()`` and limited to ``size`` rows."""
        # TODO use a more efficient algorithm, when the table count is known
        shuffled = tbl.order_by(Random())
        return shuffled.limit(size)

    def random_sample_ratio_approx(self, tbl: ITable, ratio: float) -> ITable:
        """Return ``tbl`` filtered by the condition ``Random() < ratio``."""
        keep_row = Random() < ratio
        return tbl.where(keep_row)


@attrs.define(frozen=False)
class Mixin_OptimizerHints(AbstractMixin_OptimizerHints):
def optimizer_hints(self, hints: str) -> str:
Expand Down Expand Up @@ -338,8 +308,6 @@ def render_compilable(self, c: Compiler, elem: Compilable) -> str:
return self.render_explain(c, elem)
elif isinstance(elem, CurrentTimestamp):
return self.render_currenttimestamp(c, elem)
elif isinstance(elem, TimeTravel):
return self.render_timetravel(c, elem)
elif isinstance(elem, CreateTable):
return self.render_createtable(c, elem)
elif isinstance(elem, DropTable):
Expand Down Expand Up @@ -616,16 +584,6 @@ def render_explain(self, c: Compiler, elem: Explain) -> str:
def render_currenttimestamp(self, c: Compiler, elem: CurrentTimestamp) -> str:
    """Render a ``CurrentTimestamp`` node by delegating to ``self.current_timestamp()``.

    Neither ``c`` nor ``elem`` is consulted.
    """
    timestamp_sql = self.current_timestamp()
    return timestamp_sql

def render_timetravel(self, c: Compiler, elem: TimeTravel) -> str:
    """Render a ``TimeTravel`` node.

    The historical-data expression is obtained from ``c.time_travel(...)`` using
    the fields of ``elem``, then compiled through ``self.compile``.
    """
    assert isinstance(c, AbstractMixin_TimeTravel)
    # TODO: why is it c.? why not self? time-travelling is the dialect's thing, isn't it?
    historical = c.time_travel(
        elem.table,
        before=elem.before,
        timestamp=elem.timestamp,
        offset=elem.offset,
        statement=elem.statement,
    )
    return self.compile(c, historical)

def render_createtable(self, c: Compiler, elem: CreateTable) -> str:
ne = "IF NOT EXISTS " if elem.if_not_exists else ""
if elem.source_table:
Expand Down Expand Up @@ -1043,10 +1001,6 @@ def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType], whe
assert col_name in col_dict
col_dict[col_name] = String_VaryingAlphanum()

# @lru_cache()
# def get_table_schema(self, path: DbPath) -> Dict[str, ColType]:
# return self.query_table_schema(path)

def _normalize_table_path(self, path: DbPath) -> DbPath:
if len(path) == 1:
return self.default_schema, path[0]
Expand Down Expand Up @@ -1080,9 +1034,6 @@ def close(self):
self.is_closed = True
return super().close()

def list_tables(self, tables_like, schema=None):
    """Run the dialect's list-tables query and return what ``self.query`` gives back.

    ``schema`` falls back to ``self.default_schema`` when it is falsy;
    ``tables_like`` is forwarded as the dialect's ``like`` argument.
    """
    target_schema = schema or self.default_schema
    listing_query = self.dialect.list_tables(target_schema, tables_like)
    return self.query(listing_query)

@property
@abstractmethod
def dialect(self) -> BaseDialect:
Expand Down
42 changes: 1 addition & 41 deletions data_diff/databases/bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
from data_diff.abcs.mixins import (
AbstractMixin_MD5,
AbstractMixin_NormalizeValue,
AbstractMixin_Schema,
AbstractMixin_TimeTravel,
)
from data_diff.abcs.compiler import Compilable
from data_diff.queries.api import this, table, SKIP, code
Expand Down Expand Up @@ -63,9 +61,7 @@ def import_bigquery_service_account_impersonation():


@attrs.define(frozen=False)
class Dialect(
BaseDialect, AbstractMixin_Schema, AbstractMixin_MD5, AbstractMixin_NormalizeValue, AbstractMixin_TimeTravel
):
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
name = "BigQuery"
ROUNDS_ON_PREC_LOSS = False # Technically BigQuery doesn't allow implicit rounding or truncation
TYPE_CLASSES = {
Expand Down Expand Up @@ -186,42 +182,6 @@ def normalize_struct(self, value: str, _coltype: Struct) -> str:
# match on both sides: i.e. have properly ordered keys, same spacing, same quotes, etc.
return f"to_json_string({value})"

def list_tables(self, table_schema: str, like: Compilable = None) -> Compilable:
    """Build a query selecting the names of the base tables in ``table_schema``.

    Reads from ``table(table_schema, "INFORMATION_SCHEMA", "TABLES")`` and keeps
    rows whose ``table_schema`` equals the argument and whose ``table_type`` is
    "BASE TABLE". When ``like`` is not None the table name must also match it via
    ``like``; otherwise that condition is replaced by ``SKIP``.
    """
    info_tables = table(table_schema, "INFORMATION_SCHEMA", "TABLES")
    conditions = [
        this.table_schema == table_schema,
        SKIP if like is None else this.table_name.like(like),
        this.table_type == "BASE TABLE",
    ]
    return info_tables.where(*conditions).select(this.table_name)

def time_travel(
    self,
    table: Compilable,
    before: bool = False,
    timestamp: Compilable = None,
    offset: Compilable = None,
    statement: Compilable = None,
) -> Compilable:
    """Select historical data from a table using ``FOR SYSTEM_TIME AS OF``.

    Parameters:
        table - The table whose history we're querying
        before - must be False; not supported here
        timestamp - the point in time to query as of
        offset - how far back from CURRENT_TIMESTAMP() to query
        statement - must be None; not supported here

    Exactly one of `timestamp` or `offset` must be given.

    Raises:
        NotImplementedError - if `before` is truthy or `statement` is given
        ValueError - if both or neither of `timestamp` and `offset` are given
    """
    if before:
        raise NotImplementedError("before=True not supported for BigQuery time-travel")

    if statement is not None:
        raise NotImplementedError("BigQuery time-travel doesn't support querying by statement id")

    # Explicit validation instead of `assert`, which is stripped under `python -O`
    # and would otherwise let `INTERVAL None HOUR` be rendered.
    if (timestamp is None) == (offset is None):
        raise ValueError("BigQuery time-travel requires exactly one of `timestamp` or `offset`")

    if timestamp is not None:
        return code("{table} FOR SYSTEM_TIME AS OF {timestamp}", table=table, timestamp=timestamp)

    # No trailing ';' here: the fragment is a table expression, consistent with
    # the timestamp branch above.
    # NOTE(review): the abstract docs describe `offset` as seconds, but this
    # renders it as HOUR -- confirm the intended unit before relying on it.
    return code(
        "{table} FOR SYSTEM_TIME AS OF TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {offset} HOUR)",
        table=table,
        offset=offset,
    )


@attrs.define(frozen=False, init=False, kw_only=True)
class BigQuery(Database):
Expand Down
13 changes: 2 additions & 11 deletions data_diff/databases/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from data_diff.abcs.mixins import (
AbstractMixin_MD5,
AbstractMixin_NormalizeValue,
AbstractMixin_RandomSample,
)
from data_diff.databases.base import (
Database,
Expand All @@ -31,9 +30,7 @@
TIMESTAMP_PRECISION_POS,
CHECKSUM_OFFSET,
)
from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, Mixin_Schema
from data_diff.queries.ast_classes import ITable
from data_diff.queries.api import code
from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS


@import_helper("duckdb")
Expand All @@ -44,7 +41,7 @@ def import_duckdb():


@attrs.define(frozen=False)
class Dialect(BaseDialect, Mixin_Schema, AbstractMixin_MD5, AbstractMixin_NormalizeValue, AbstractMixin_RandomSample):
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
name = "DuckDB"
ROUNDS_ON_PREC_LOSS = False
SUPPORTS_PRIMARY_KEY = True
Expand Down Expand Up @@ -120,12 +117,6 @@ def normalize_number(self, value: str, coltype: FractionalType) -> str:
def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
    """Cast ``value`` with ``::INTEGER`` and pass the result through ``self.to_string``."""
    as_integer = f"{value}::INTEGER"
    return self.to_string(as_integer)

def random_sample_n(self, tbl: ITable, size: int) -> ITable:
    """Wrap ``tbl`` in a ``SELECT ... USING SAMPLE {size}`` code fragment."""
    sample_template = "SELECT * FROM ({tbl}) USING SAMPLE {size};"
    return code(sample_template, tbl=tbl, size=size)

def random_sample_ratio_approx(self, tbl: ITable, ratio: float) -> ITable:
    """Wrap ``tbl`` in a ``SELECT ... USING SAMPLE {percent}%`` code fragment.

    ``ratio`` is converted to a whole-number percentage with ``int(100 * ratio)``,
    so any fractional percent is truncated.
    """
    percent = int(100 * ratio)
    sample_template = "SELECT * FROM ({tbl}) USING SAMPLE {percent}%;"
    return code(sample_template, tbl=tbl, percent=percent)


@attrs.define(frozen=False, init=False, kw_only=True)
class DuckDB(Database):
Expand Down
2 changes: 0 additions & 2 deletions data_diff/databases/mssql.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
ConnectError,
BaseDialect,
)
from data_diff.databases.base import Mixin_Schema
from data_diff.abcs.database_types import (
JSON,
NumericType,
Expand All @@ -40,7 +39,6 @@ def import_mssql():
@attrs.define(frozen=False)
class Dialect(
BaseDialect,
Mixin_Schema,
Mixin_OptimizerHints,
AbstractMixin_MD5,
AbstractMixin_NormalizeValue,
Expand Down
2 changes: 0 additions & 2 deletions data_diff/databases/mysql.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
CHECKSUM_HEXDIGITS,
TIMESTAMP_PRECISION_POS,
CHECKSUM_OFFSET,
Mixin_Schema,
)


Expand All @@ -45,7 +44,6 @@ def import_mysql():
@attrs.define(frozen=False)
class Dialect(
BaseDialect,
Mixin_Schema,
Mixin_OptimizerHints,
AbstractMixin_MD5,
AbstractMixin_NormalizeValue,
Expand Down
15 changes: 1 addition & 14 deletions data_diff/databases/oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
TimestampTZ,
FractionalType,
)
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue, AbstractMixin_Schema
from data_diff.abcs.compiler import Compilable
from data_diff.queries.api import this, table, SKIP
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
from data_diff.databases.base import (
BaseDialect,
Mixin_OptimizerHints,
Expand Down Expand Up @@ -46,7 +44,6 @@ def import_oracle():
class Dialect(
BaseDialect,
Mixin_OptimizerHints,
AbstractMixin_Schema,
AbstractMixin_MD5,
AbstractMixin_NormalizeValue,
):
Expand Down Expand Up @@ -162,16 +159,6 @@ def normalize_number(self, value: str, coltype: FractionalType) -> str:
format_str += "0." + "9" * (coltype.precision - 1) + "0"
return f"to_char({value}, '{format_str}')"

def list_tables(self, table_schema: str, like: Compilable = None) -> Compilable:
    """Build a query selecting table names from ``ALL_TABLES`` for one owner.

    Keeps rows whose ``OWNER`` equals ``table_schema``. When ``like`` is not None
    the ``TABLE_NAME`` must also match it via ``like``; otherwise that condition
    is replaced by ``SKIP``. The selected column is exposed as ``table_name``.
    """
    all_tables = table("ALL_TABLES")
    conditions = [
        this.OWNER == table_schema,
        SKIP if like is None else this.TABLE_NAME.like(like),
    ]
    return all_tables.where(*conditions).select(table_name=this.TABLE_NAME)


@attrs.define(frozen=False, init=False, kw_only=True)
class Oracle(ThreadedDatabase):
Expand Down
4 changes: 2 additions & 2 deletions data_diff/databases/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
Date,
)
from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
from data_diff.databases.base import BaseDialect, ThreadedDatabase, import_helper, ConnectError, Mixin_Schema
from data_diff.databases.base import BaseDialect, ThreadedDatabase, import_helper, ConnectError
from data_diff.databases.base import (
MD5_HEXDIGITS,
CHECKSUM_HEXDIGITS,
Expand All @@ -40,7 +40,7 @@ def import_postgresql():


@attrs.define(frozen=False)
class PostgresqlDialect(BaseDialect, Mixin_Schema, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
class PostgresqlDialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
name = "PostgreSQL"
ROUNDS_ON_PREC_LOSS = True
SUPPORTS_PRIMARY_KEY = True
Expand Down
3 changes: 1 addition & 2 deletions data_diff/databases/presto.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
Database,
import_helper,
ThreadLocalInterpreter,
Mixin_Schema,
)
from data_diff.databases.base import (
MD5_HEXDIGITS,
Expand All @@ -53,7 +52,7 @@ def import_presto():
return prestodb


class Dialect(BaseDialect, Mixin_Schema, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
name = "Presto"
ROUNDS_ON_PREC_LOSS = True
TYPE_CLASSES = {
Expand Down
Loading