datafold · dlawin · Oct 25, 2023 · Oct 25, 2023 · Oct 25, 2023 · Oct 25, 2023
diff --git a/data_diff/abcs/mixins.py b/data_diff/abcs/mixins.py
@@ -122,66 +122,6 @@ def md5_as_int(self, s: str) -> str:
         "Provide SQL for computing md5 and returning an int"
 
 
-@attrs.define(frozen=False)
-class AbstractMixin_Schema(AbstractMixin):
-    """Methods for querying the database schema
-
-    TODO: Move AbstractDatabase.query_table_schema() and friends over here
-    """
-
-    def table_information(self) -> Compilable:
-        "Query to return a table of schema information about existing tables"
-        raise NotImplementedError()
-
-    @abstractmethod
-    def list_tables(self, table_schema: str, like: Compilable = None) -> Compilable:
-        """Query to select the list of tables in the schema. (query return type: table[str])
-
-        If 'like' is specified, the value is applied to the table name, using the 'like' operator.
-        """
-
-
-@attrs.define(frozen=False)
-class AbstractMixin_RandomSample(AbstractMixin):
-    @abstractmethod
-    def random_sample_n(self, tbl: str, size: int) -> str:
-        """Take a random sample of the given size, i.e. return 'size' amount of rows"""
-
-    @abstractmethod
-    def random_sample_ratio_approx(self, tbl: str, ratio: float) -> str:
-        """Take a random sample of the approximate size determined by the ratio (0..1), where 0 means no rows, and 1 means all rows
-
-        i.e. the actual mount of rows returned may vary by standard deviation.
-        """
-
-    # def random_sample_ratio(self, table: ITable, ratio: float):
-    #     """Take a random sample of the size determined by the ratio (0..1), where 0 means no rows, and 1 means all rows
-    #     """
-
-
-@attrs.define(frozen=False)
-class AbstractMixin_TimeTravel(AbstractMixin):
-    @abstractmethod
-    def time_travel(
-        self,
-        table: Compilable,
-        before: bool = False,
-        timestamp: Compilable = None,
-        offset: Compilable = None,
-        statement: Compilable = None,
-    ) -> Compilable:
-        """Selects historical data from a table
-
-        Parameters:
-            table - The name of the table whose history we're querying
-            timestamp - A constant timestamp
-            offset - the time 'offset' seconds before now
-            statement - identifier for statement, e.g. query ID
-
-        Must specify exactly one of `timestamp`, `offset` or `statement`.
-        """
-
-
 @attrs.define(frozen=False)
 class AbstractMixin_OptimizerHints(AbstractMixin):
     @abstractmethod

diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
@@ -48,7 +48,6 @@
     TableAlias,
     TableOp,
     TablePath,
-    TimeTravel,
     TruncateTable,
     UnaryOp,
     WhenThen,
@@ -74,10 +73,8 @@
     Boolean,
     JSON,
 )
-from data_diff.abcs.mixins import AbstractMixin_TimeTravel, Compilable
+from data_diff.abcs.mixins import Compilable
 from data_diff.abcs.mixins import (
-    AbstractMixin_Schema,
-    AbstractMixin_RandomSample,
     AbstractMixin_NormalizeValue,
     AbstractMixin_OptimizerHints,
 )
@@ -201,33 +198,6 @@ def apply_query(callback: Callable[[str], Any], sql_code: Union[str, ThreadLocal
         return callback(sql_code)
 
 
-@attrs.define(frozen=False)
-class Mixin_Schema(AbstractMixin_Schema):
-    def table_information(self) -> Compilable:
-        return table("information_schema", "tables")
-
-    def list_tables(self, table_schema: str, like: Compilable = None) -> Compilable:
-        return (
-            self.table_information()
-            .where(
-                this.table_schema == table_schema,
-                this.table_name.like(like) if like is not None else SKIP,
-                this.table_type == "BASE TABLE",
-            )
-            .select(this.table_name)
-        )
-
-
-@attrs.define(frozen=False)
-class Mixin_RandomSample(AbstractMixin_RandomSample):
-    def random_sample_n(self, tbl: ITable, size: int) -> ITable:
-        # TODO use a more efficient algorithm, when the table count is known
-        return tbl.order_by(Random()).limit(size)
-
-    def random_sample_ratio_approx(self, tbl: ITable, ratio: float) -> ITable:
-        return tbl.where(Random() < ratio)
-
-
 @attrs.define(frozen=False)
 class Mixin_OptimizerHints(AbstractMixin_OptimizerHints):
     def optimizer_hints(self, hints: str) -> str:
@@ -338,8 +308,6 @@ def render_compilable(self, c: Compiler, elem: Compilable) -> str:
             return self.render_explain(c, elem)
         elif isinstance(elem, CurrentTimestamp):
             return self.render_currenttimestamp(c, elem)
-        elif isinstance(elem, TimeTravel):
-            return self.render_timetravel(c, elem)
         elif isinstance(elem, CreateTable):
             return self.render_createtable(c, elem)
         elif isinstance(elem, DropTable):
@@ -616,16 +584,6 @@ def render_explain(self, c: Compiler, elem: Explain) -> str:
     def render_currenttimestamp(self, c: Compiler, elem: CurrentTimestamp) -> str:
         return self.current_timestamp()
 
-    def render_timetravel(self, c: Compiler, elem: TimeTravel) -> str:
-        assert isinstance(c, AbstractMixin_TimeTravel)
-        return self.compile(
-            c,
-            # TODO: why is it c.? why not self? time-trvelling is the dialect's thing, isnt't it?
-            c.time_travel(
-                elem.table, before=elem.before, timestamp=elem.timestamp, offset=elem.offset, statement=elem.statement
-            ),
-        )
-
     def render_createtable(self, c: Compiler, elem: CreateTable) -> str:
         ne = "IF NOT EXISTS " if elem.if_not_exists else ""
         if elem.source_table:
@@ -1043,10 +1001,6 @@ def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType], whe
                         assert col_name in col_dict
                         col_dict[col_name] = String_VaryingAlphanum()
 
-    # @lru_cache()
-    # def get_table_schema(self, path: DbPath) -> Dict[str, ColType]:
-    #     return self.query_table_schema(path)
-
     def _normalize_table_path(self, path: DbPath) -> DbPath:
         if len(path) == 1:
             return self.default_schema, path[0]
@@ -1080,9 +1034,6 @@ def close(self):
         self.is_closed = True
         return super().close()
 
-    def list_tables(self, tables_like, schema=None):
-        return self.query(self.dialect.list_tables(schema or self.default_schema, tables_like))
-
     @property
     @abstractmethod
     def dialect(self) -> BaseDialect:

diff --git a/data_diff/databases/bigquery.py b/data_diff/databases/bigquery.py
@@ -23,8 +23,6 @@
 from data_diff.abcs.mixins import (
     AbstractMixin_MD5,
     AbstractMixin_NormalizeValue,
-    AbstractMixin_Schema,
-    AbstractMixin_TimeTravel,
 )
 from data_diff.abcs.compiler import Compilable
 from data_diff.queries.api import this, table, SKIP, code
@@ -63,9 +61,7 @@ def import_bigquery_service_account_impersonation():
 
 
 @attrs.define(frozen=False)
-class Dialect(
-    BaseDialect, AbstractMixin_Schema, AbstractMixin_MD5, AbstractMixin_NormalizeValue, AbstractMixin_TimeTravel
-):
+class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
     name = "BigQuery"
     ROUNDS_ON_PREC_LOSS = False  # Technically BigQuery doesn't allow implicit rounding or truncation
     TYPE_CLASSES = {
@@ -186,42 +182,6 @@ def normalize_struct(self, value: str, _coltype: Struct) -> str:
         # match on both sides: i.e. have properly ordered keys, same spacing, same quotes, etc.
         return f"to_json_string({value})"
 
-    def list_tables(self, table_schema: str, like: Compilable = None) -> Compilable:
-        return (
-            table(table_schema, "INFORMATION_SCHEMA", "TABLES")
-            .where(
-                this.table_schema == table_schema,
-                this.table_name.like(like) if like is not None else SKIP,
-                this.table_type == "BASE TABLE",
-            )
-            .select(this.table_name)
-        )
-
-    def time_travel(
-        self,
-        table: Compilable,
-        before: bool = False,
-        timestamp: Compilable = None,
-        offset: Compilable = None,
-        statement: Compilable = None,
-    ) -> Compilable:
-        if before:
-            raise NotImplementedError("before=True not supported for BigQuery time-travel")
-
-        if statement is not None:
-            raise NotImplementedError("BigQuery time-travel doesn't support querying by statement id")
-
-        if timestamp is not None:
-            assert offset is None
-            return code("{table} FOR SYSTEM_TIME AS OF {timestamp}", table=table, timestamp=timestamp)
-
-        assert offset is not None
-        return code(
-            "{table} FOR SYSTEM_TIME AS OF TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {offset} HOUR);",
-            table=table,
-            offset=offset,
-        )
-
 
 @attrs.define(frozen=False, init=False, kw_only=True)
 class BigQuery(Database):

diff --git a/data_diff/databases/duckdb.py b/data_diff/databases/duckdb.py
@@ -20,7 +20,6 @@
 from data_diff.abcs.mixins import (
     AbstractMixin_MD5,
     AbstractMixin_NormalizeValue,
-    AbstractMixin_RandomSample,
 )
 from data_diff.databases.base import (
     Database,
@@ -31,9 +30,7 @@
     TIMESTAMP_PRECISION_POS,
     CHECKSUM_OFFSET,
 )
-from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, Mixin_Schema
-from data_diff.queries.ast_classes import ITable
-from data_diff.queries.api import code
+from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS
 
 
 @import_helper("duckdb")
@@ -44,7 +41,7 @@ def import_duckdb():
 
 
 @attrs.define(frozen=False)
-class Dialect(BaseDialect, Mixin_Schema, AbstractMixin_MD5, AbstractMixin_NormalizeValue, AbstractMixin_RandomSample):
+class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
     name = "DuckDB"
     ROUNDS_ON_PREC_LOSS = False
     SUPPORTS_PRIMARY_KEY = True
@@ -120,12 +117,6 @@ def normalize_number(self, value: str, coltype: FractionalType) -> str:
     def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
         return self.to_string(f"{value}::INTEGER")
 
-    def random_sample_n(self, tbl: ITable, size: int) -> ITable:
-        return code("SELECT * FROM ({tbl}) USING SAMPLE {size};", tbl=tbl, size=size)
-
-    def random_sample_ratio_approx(self, tbl: ITable, ratio: float) -> ITable:
-        return code("SELECT * FROM ({tbl}) USING SAMPLE {percent}%;", tbl=tbl, percent=int(100 * ratio))
-
 
 @attrs.define(frozen=False, init=False, kw_only=True)
 class DuckDB(Database):

diff --git a/data_diff/databases/mssql.py b/data_diff/databases/mssql.py
@@ -13,7 +13,6 @@
     ConnectError,
     BaseDialect,
 )
-from data_diff.databases.base import Mixin_Schema
 from data_diff.abcs.database_types import (
     JSON,
     NumericType,
@@ -40,7 +39,6 @@ def import_mssql():
 @attrs.define(frozen=False)
 class Dialect(
     BaseDialect,
-    Mixin_Schema,
     Mixin_OptimizerHints,
     AbstractMixin_MD5,
     AbstractMixin_NormalizeValue,

diff --git a/data_diff/databases/mysql.py b/data_diff/databases/mysql.py
@@ -31,7 +31,6 @@
     CHECKSUM_HEXDIGITS,
     TIMESTAMP_PRECISION_POS,
     CHECKSUM_OFFSET,
-    Mixin_Schema,
 )
 
 
@@ -45,7 +44,6 @@ def import_mysql():
 @attrs.define(frozen=False)
 class Dialect(
     BaseDialect,
-    Mixin_Schema,
     Mixin_OptimizerHints,
     AbstractMixin_MD5,
     AbstractMixin_NormalizeValue,

diff --git a/data_diff/databases/oracle.py b/data_diff/databases/oracle.py
@@ -16,9 +16,7 @@
     TimestampTZ,
     FractionalType,
 )
-from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue, AbstractMixin_Schema
-from data_diff.abcs.compiler import Compilable
-from data_diff.queries.api import this, table, SKIP
+from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
 from data_diff.databases.base import (
     BaseDialect,
     Mixin_OptimizerHints,
@@ -46,7 +44,6 @@ def import_oracle():
 class Dialect(
     BaseDialect,
     Mixin_OptimizerHints,
-    AbstractMixin_Schema,
     AbstractMixin_MD5,
     AbstractMixin_NormalizeValue,
 ):
@@ -162,16 +159,6 @@ def normalize_number(self, value: str, coltype: FractionalType) -> str:
             format_str += "0." + "9" * (coltype.precision - 1) + "0"
         return f"to_char({value}, '{format_str}')"
 
-    def list_tables(self, table_schema: str, like: Compilable = None) -> Compilable:
-        return (
-            table("ALL_TABLES")
-            .where(
-                this.OWNER == table_schema,
-                this.TABLE_NAME.like(like) if like is not None else SKIP,
-            )
-            .select(table_name=this.TABLE_NAME)
-        )
-
 
 @attrs.define(frozen=False, init=False, kw_only=True)
 class Oracle(ThreadedDatabase):

diff --git a/data_diff/databases/postgresql.py b/data_diff/databases/postgresql.py
@@ -19,7 +19,7 @@
     Date,
 )
 from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
-from data_diff.databases.base import BaseDialect, ThreadedDatabase, import_helper, ConnectError, Mixin_Schema
+from data_diff.databases.base import BaseDialect, ThreadedDatabase, import_helper, ConnectError
 from data_diff.databases.base import (
     MD5_HEXDIGITS,
     CHECKSUM_HEXDIGITS,
@@ -40,7 +40,7 @@ def import_postgresql():
 
 
 @attrs.define(frozen=False)
-class PostgresqlDialect(BaseDialect, Mixin_Schema, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
+class PostgresqlDialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
     name = "PostgreSQL"
     ROUNDS_ON_PREC_LOSS = True
     SUPPORTS_PRIMARY_KEY = True

diff --git a/data_diff/databases/presto.py b/data_diff/databases/presto.py
@@ -27,7 +27,6 @@
     Database,
     import_helper,
     ThreadLocalInterpreter,
-    Mixin_Schema,
 )
 from data_diff.databases.base import (
     MD5_HEXDIGITS,
@@ -53,7 +52,7 @@ def import_presto():
     return prestodb
 
 
-class Dialect(BaseDialect, Mixin_Schema, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
+class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
     name = "Presto"
     ROUNDS_ON_PREC_LOSS = True
     TYPE_CLASSES = {