Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 5cd424d

Browse files
committed
Joindiff: Added support for materializing diff results as tables (-m)
1 parent 9f404a0 commit 5cd424d

File tree

6 files changed

+104
-38
lines changed

6 files changed

+104
-38
lines changed

data_diff/__main__.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
import rich
1010
import click
1111

12-
from .utils import remove_password_from_url, safezip, match_like
12+
from data_diff.databases.base import parse_table_name
13+
14+
from .utils import eval_name_template, remove_password_from_url, safezip, match_like
1315
from .diff_tables import Algorithm
1416
from .hashdiff_tables import HashDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR
1517
from .joindiff_tables import JoinDiffer
@@ -104,6 +106,7 @@ def write_usage(self, prog: str, args: str = "", prefix: Optional[str] = None) -
104106
help=f"Minimal bisection threshold. Below it, data-diff will download the data and compare it locally. Default={DEFAULT_BISECTION_THRESHOLD}.",
105107
metavar="NUM",
106108
)
109+
@click.option("-m", "--materialize", default=None, metavar="TABLE_NAME", help="Materialize the diff results into a new table in the database.")
107110
@click.option(
108111
"--min-age",
109112
default=None,
@@ -126,6 +129,11 @@ def write_usage(self, prog: str, args: str = "", prefix: Optional[str] = None) -
126129
is_flag=True,
127130
help="Column names are treated as case-sensitive. Otherwise, data-diff corrects their case according to schema.",
128131
)
132+
@click.option(
133+
"--assume-unique-key",
134+
is_flag=True,
135+
help="Skip validating the uniqueness of the key column during joindiff, which is costly in non-cloud dbs.",
136+
)
129137
@click.option(
130138
"-j",
131139
"--threads",
@@ -192,6 +200,8 @@ def _main(
192200
case_sensitive,
193201
json_output,
194202
where,
203+
assume_unique_key,
204+
materialize,
195205
threads1=None,
196206
threads2=None,
197207
__conf__=None,
@@ -256,6 +266,8 @@ def _main(
256266
differ = JoinDiffer(
257267
threaded=threaded,
258268
max_threadpool_size=threads and threads * 2,
269+
validate_unique_key = not assume_unique_key,
270+
materialize_to_table = materialize and parse_table_name(eval_name_template(materialize)),
259271
)
260272
else:
261273
assert algorithm == Algorithm.HASHDIFF

data_diff/databases/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ class Database(AbstractDatabase):
107107
def name(self):
108108
return type(self).__name__
109109

110-
def query(self, sql_ast: Expr, res_type: type):
110+
def query(self, sql_ast: Expr, res_type: type = None):
111111
"Query the given SQL code/AST, and attempt to convert the result to type 'res_type'"
112112

113113
compiler = Compiler(self)

data_diff/diff_tables.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def _threaded_call_as_completed(self, func, iterable):
6868
@contextmanager
6969
def _run_in_background(self, *funcs):
7070
with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as task_pool:
71-
futures = [task_pool.submit(f) for f in funcs]
71+
futures = [task_pool.submit(f) for f in funcs if f is not None]
7272
yield futures
7373
for f in futures:
7474
f.result()

data_diff/joindiff_tables.py

Lines changed: 59 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,12 @@
55
from decimal import Decimal
66
from functools import partial
77
import logging
8-
from typing import Dict, List
8+
from typing import Dict, List, Optional
99

1010
from runtype import dataclass
1111

12+
from data_diff.databases.database_types import DbPath, Schema
13+
1214

1315
from .utils import safezip
1416
from .databases.base import Database
@@ -17,15 +19,16 @@
1719
from .diff_tables import TableDiffer, DiffResult
1820
from .thread_utils import ThreadedYielder
1921

20-
from .queries import table, sum_, min_, max_, avg
22+
from .queries import table, sum_, min_, max_, avg, SKIP
2123
from .queries.api import and_, if_, or_, outerjoin, leftjoin, rightjoin, this, ITable
22-
from .queries.ast_classes import Concat, Count, Expr, Random
24+
from .queries.ast_classes import Concat, Count, Expr, Random, TablePath
2325
from .queries.compiler import Compiler
2426
from .queries.extras import NormalizeAsString
2527

26-
2728
logger = logging.getLogger("joindiff_tables")
2829

30+
WRITE_LIMIT = 1000
31+
2932

3033
def merge_dicts(dicts):
3134
i = iter(dicts)
@@ -60,6 +63,18 @@ def create_temp_table(c: Compiler, name: str, expr: Expr):
6063
return f"create temporary table {c.quote(name)} as {c.compile(expr)}"
6164

6265

66+
def drop_table(db, name: DbPath):
67+
t = TablePath(name)
68+
db.query(t.drop(if_exists=True))
69+
70+
def append_to_table(name: DbPath, expr: Expr):
71+
t = TablePath(name, expr.schema)
72+
yield t.create(if_not_exists=True) # uses expr.schema
73+
yield 'commit'
74+
yield t.insert_expr(expr)
75+
yield 'commit'
76+
77+
6378
def bool_to_int(x):
6479
return if_(x, 1, 0)
6580

@@ -117,6 +132,8 @@ class JoinDiffer(TableDiffer):
117132
stats: dict = {}
118133
validate_unique_key: bool = True
119134
sample_exclusive_rows: bool = True
135+
materialize_to_table: DbPath = None
136+
write_limit: int = WRITE_LIMIT
120137

121138
def _diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
122139
db = table1.database
@@ -128,8 +145,12 @@ def _diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult
128145

129146

130147
bg_funcs = [partial(self._test_duplicate_keys, table1, table2)] if self.validate_unique_key else []
148+
if self.materialize_to_table:
149+
drop_table(db, self.materialize_to_table)
150+
db.query('COMMIT')
131151

132152
with self._run_in_background(*bg_funcs):
153+
133154
if isinstance(db, (Snowflake, BigQuery)):
134155
# Don't segment the table; let the database handling parallelization
135156
yield from self._diff_segments(None, table1, table2, None)
@@ -147,12 +168,29 @@ def _diff_segments(self, ti: ThreadedYielder, table1: TableSegment, table2: Tabl
147168
f"size <= {max_rows}"
148169
)
149170

171+
db = table1.database
172+
diff_rows, a_cols, b_cols, is_diff_cols = self._create_outer_join(table1, table2)
173+
150174
with self._run_in_background(
151175
partial(self._collect_stats, 1, table1),
152176
partial(self._collect_stats, 2, table2),
153177
partial(self._test_null_keys, table1, table2),
178+
partial(self._sample_and_count_exclusive, db, diff_rows, a_cols, b_cols),
179+
partial(self._count_diff_per_column, db, diff_rows, list(a_cols), is_diff_cols),
180+
partial(self._materialize_diff, db, diff_rows, segment_index=segment_index) if self.materialize_to_table else None,
154181
):
155-
yield from self._outer_join(table1, table2)
182+
183+
logger.debug("Querying for different rows")
184+
for is_xa, is_xb, *x in db.query(diff_rows, list):
185+
if is_xa and is_xb:
186+
# Can't both be exclusive, meaning a pk is NULL
187+
# This can happen if the explicit null test didn't finish running yet
188+
raise ValueError(f"NULL values in one or more primary keys")
189+
is_diff, a_row, b_row = _slice_tuple(x, len(is_diff_cols), len(a_cols), len(b_cols))
190+
if not is_xb:
191+
yield "-", tuple(a_row)
192+
if not is_xa:
193+
yield "+", tuple(b_row)
156194

157195
def _test_duplicate_keys(self, table1, table2):
158196
logger.debug("Testing for duplicate keys")
@@ -162,7 +200,7 @@ def _test_duplicate_keys(self, table1, table2):
162200
t = ts._make_select()
163201
key_columns = [ts.key_column] # XXX
164202

165-
q = t.select(total=Count(), total_distinct=Count(Concat(key_columns), distinct=True))
203+
q = t.select(total=Count(), total_distinct=Count(Concat(this[key_columns]), distinct=True))
166204
total, total_distinct = ts.database.query(q, tuple)
167205
if total != total_distinct:
168206
raise ValueError("Duplicate primary keys")
@@ -175,7 +213,7 @@ def _test_null_keys(self, table1, table2):
175213
t = ts._make_select()
176214
key_columns = [ts.key_column] # XXX
177215

178-
q = t.select(*key_columns).where(or_(this[k] == None for k in key_columns))
216+
q = t.select(*this[key_columns]).where(or_(this[k] == None for k in key_columns))
179217
nulls = ts.database.query(q, list)
180218
if nulls:
181219
raise ValueError(f"NULL values in one or more primary keys")
@@ -188,10 +226,10 @@ def _collect_stats(self, i, table):
188226
# Metrics
189227
col_exprs = merge_dicts(
190228
{
191-
f"sum_{c}": sum_(c),
192-
f"avg_{c}": avg(c),
193-
f"min_{c}": min_(c),
194-
f"max_{c}": max_(c),
229+
f"sum_{c}": sum_(this[c]),
230+
f"avg_{c}": avg(this[c]),
231+
f"min_{c}": min_(this[c]),
232+
f"max_{c}": max_(this[c]),
195233
}
196234
for c in table._relevant_columns
197235
if c == "id" # TODO just if the right type
@@ -209,8 +247,7 @@ def _collect_stats(self, i, table):
209247
# stats.diff_ratio_by_column = diff_stats
210248
# stats.diff_ratio_total = diff_stats['total_diff']
211249

212-
213-
def _outer_join(self, table1, table2):
250+
def _create_outer_join(self, table1, table2):
214251
db = table1.database
215252
if db is not table2.database:
216253
raise ValueError("Joindiff only applies to tables within the same database")
@@ -239,23 +276,8 @@ def _outer_join(self, table1, table2):
239276
_outerjoin(db, a, b, keys1, keys2, {**is_diff_cols, **a_cols, **b_cols})
240277
.where(or_(this[c] == 1 for c in is_diff_cols))
241278
)
279+
return diff_rows, a_cols, b_cols, is_diff_cols
242280

243-
with self._run_in_background(
244-
partial(self._sample_and_count_exclusive, db, diff_rows, a_cols, b_cols),
245-
partial(self._count_diff_per_column, db, diff_rows, cols1, is_diff_cols)
246-
):
247-
248-
logger.debug("Querying for different rows")
249-
for is_xa, is_xb, *x in db.query(diff_rows, list):
250-
if is_xa and is_xb:
251-
# Can't both be exclusive, meaning a pk is NULL
252-
# This can happen if the explicit null test didn't finish running yet
253-
raise ValueError(f"NULL values in one or more primary keys")
254-
is_diff, a_row, b_row = _slice_tuple(x, len(is_diff_cols), len(a_cols), len(b_cols))
255-
if not is_xb:
256-
yield "-", tuple(a_row)
257-
if not is_xa:
258-
yield "+", tuple(b_row)
259281

260282
def _count_diff_per_column(self, db, diff_rows, cols, is_diff_cols):
261283
logger.info("Counting differences per column")
@@ -280,7 +302,7 @@ def _sample_and_count_exclusive(self, db, diff_rows, a_cols, b_cols):
280302
def exclusive_rows(expr):
281303
c = Compiler(db)
282304
name = c.new_unique_table_name("temp_table")
283-
yield create_temp_table(c, name, expr)
305+
yield create_temp_table(c, name, expr.limit(self.write_limit))
284306
exclusive_rows = table(name, schema=expr.source_table.schema)
285307

286308
count = yield exclusive_rows.count()
@@ -293,3 +315,10 @@ def exclusive_rows(expr):
293315

294316
# Run as a sequence of thread-local queries (compiled into a ThreadLocalInterpreter)
295317
db.query(exclusive_rows(exclusive_rows_query), None)
318+
319+
def _materialize_diff(self, db, diff_rows, segment_index=None):
320+
assert self.materialize_to_table
321+
322+
db.query(append_to_table(self.materialize_to_table, diff_rows.limit(self.write_limit)))
323+
logger.info(f"Materialized diff to table '{'.'.join(self.materialize_to_table)}'.")
324+

data_diff/queries/ast_classes.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ class Concat(ExprNode):
140140

141141
def compile(self, c: Compiler) -> str:
142142
# We coalesce because on some DBs (e.g. MySQL) concat('a', NULL) is NULL
143-
items = [f"coalesce({c.compile(c.database.to_string(expr))}, '<null>')" for expr in self.exprs]
143+
items = [f"coalesce({c.compile(c.database.to_string(c.compile(expr)))}, '<null>')" for expr in self.exprs]
144144
assert items
145145
if len(items) == 1:
146146
return items[0]
@@ -294,6 +294,9 @@ def create(self, if_not_exists=False):
294294
raise ValueError("Schema must have a value to create table")
295295
return CreateTable(self, if_not_exists=if_not_exists)
296296

297+
def drop(self, if_exists=False):
298+
return DropTable(self, if_exists=if_exists)
299+
297300
def insert_values(self, rows):
298301
raise NotImplementedError()
299302

@@ -513,13 +516,13 @@ def resolve_names(source_table, exprs):
513516
if isinstance(expr, ExprNode):
514517
for v in expr._dfs_values():
515518
if isinstance(v, _ResolveColumn):
516-
v.resolve(source_table._get_column(v.name))
519+
v.resolve(source_table._get_column(v.resolve_name))
517520
i += 1
518521

519522

520523
@dataclass(frozen=False, eq=False, order=False)
521524
class _ResolveColumn(ExprNode, LazyOps):
522-
name: str
525+
resolve_name: str
523526
resolved: Expr = None
524527

525528
def resolve(self, expr):
@@ -528,15 +531,22 @@ def resolve(self, expr):
528531

529532
def compile(self, c: Compiler) -> str:
530533
if self.resolved is None:
531-
raise RuntimeError(f"Column not resolved: {self.name}")
534+
raise RuntimeError(f"Column not resolved: {self.resolve_name}")
532535
return self.resolved.compile(c)
533536

534537
@property
535538
def type(self):
536539
if self.resolved is None:
537-
raise RuntimeError(f"Column not resolved: {self.name}")
540+
raise RuntimeError(f"Column not resolved: {self.resolve_name}")
538541
return self.resolved.type
539542

543+
@property
544+
def name(self):
545+
if self.resolved is None:
546+
raise RuntimeError(f"Column not resolved: {self.name}")
547+
return self.resolved.name
548+
549+
540550

541551
class This:
542552
def __getattr__(self, name):
@@ -606,6 +616,15 @@ def compile(self, c: Compiler) -> str:
606616
ne = 'IF NOT EXISTS ' if self.if_not_exists else ''
607617
return f'CREATE TABLE {ne}{c.compile(self.path)}({schema})'
608618

619+
@dataclass
620+
class DropTable(Statement):
621+
path: TablePath
622+
if_exists: bool = False
623+
624+
def compile(self, c: Compiler) -> str:
625+
ie = 'IF EXISTS ' if self.if_exists else ''
626+
return f'DROP TABLE {ie}{c.compile(self.path)}'
627+
609628
@dataclass
610629
class InsertToTable(Statement):
611630
# TODO Support insert for only some columns

data_diff/utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import operator
1010
import string
1111
import threading
12+
from datetime import datetime
1213

1314
alphanums = " -" + string.digits + string.ascii_uppercase + "_" + string.ascii_lowercase
1415

@@ -295,3 +296,8 @@ def run_as_daemon(threadfunc, *args):
295296

296297
def getLogger(name):
297298
return logging.getLogger(name.rsplit('.', 1)[-1])
299+
300+
def eval_name_template(name):
301+
def get_timestamp(m):
302+
return datetime.now().isoformat('_', 'seconds').replace(':', '_')
303+
return re.sub('%t', get_timestamp, name)

0 commit comments

Comments (0)