Merge pull request #802 from datafold/tolerate-empty-samples

nolar · web-flow · commit 6738ca762c6e · 2023-12-15T22:24:30.000+01:00
Tolerate empty samples & allow custom database schemas
diff --git a/data_diff/databases/_connect.py b/data_diff/databases/_connect.py
@@ -26,7 +26,7 @@
 from data_diff.databases.mssql import MsSQL
 
 
-@attrs.define(frozen=True)
+@attrs.frozen
 class MatchUriPath:
     database_cls: Type[Database]
 
@@ -98,13 +98,11 @@ class Connect:
     """Provides methods for connecting to a supported database using a URL or connection dict."""
 
     database_by_scheme: Dict[str, Database]
-    match_uri_path: Dict[str, MatchUriPath]
     conn_cache: MutableMapping[Hashable, Database]
 
     def __init__(self, database_by_scheme: Dict[str, Database] = DATABASE_BY_SCHEME):
         super().__init__()
         self.database_by_scheme = database_by_scheme
-        self.match_uri_path = {name: MatchUriPath(cls) for name, cls in database_by_scheme.items()}
         self.conn_cache = weakref.WeakValueDictionary()
 
     def for_databases(self, *dbs) -> Self:
@@ -157,12 +155,10 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs)
             return self.connect_with_dict(conn_dict, thread_count, **kwargs)
 
         try:
-            matcher = self.match_uri_path[scheme]
+            cls = self.database_by_scheme[scheme]
         except KeyError:
             raise NotImplementedError(f"Scheme '{scheme}' currently not supported")
 
-        cls = matcher.database_cls
-
         if scheme == "databricks":
             assert not dsn.user
             kw = {}
@@ -175,6 +171,7 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs)
             kw["filepath"] = dsn.dbname
             kw["dbname"] = dsn.user
         else:
+            matcher = MatchUriPath(cls)
             kw = matcher.match_path(dsn)
 
             if scheme == "bigquery":
@@ -198,7 +195,7 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs)
 
         kw = {k: v for k, v in kw.items() if v is not None}
 
-        if issubclass(cls, ThreadedDatabase):
+        if isinstance(cls, type) and issubclass(cls, ThreadedDatabase):
             db = cls(thread_count=thread_count, **kw, **kwargs)
         else:
             db = cls(**kw, **kwargs)
@@ -209,11 +206,10 @@ def connect_with_dict(self, d, thread_count, **kwargs):
         d = dict(d)
         driver = d.pop("driver")
         try:
-            matcher = self.match_uri_path[driver]
+            cls = self.database_by_scheme[driver]
         except KeyError:
             raise NotImplementedError(f"Driver '{driver}' currently not supported")
 
-        cls = matcher.database_cls
         if issubclass(cls, ThreadedDatabase):
             db = cls(thread_count=thread_count, **d, **kwargs)
         else:
diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
@@ -1093,11 +1093,7 @@ def _refine_coltypes(
             list,
             log_message=table_path,
         )
-        if not samples_by_row:
-            raise ValueError(f"Table {table_path} is empty.")
-
-        samples_by_col = list(zip(*samples_by_row))
-
+        samples_by_col = list(zip(*samples_by_row)) if samples_by_row else [[]] * len(text_columns)
         for col_name, samples in safezip(text_columns, samples_by_col):
             uuid_samples = [s for s in samples if s and is_uuid(s)]
 
diff --git a/data_diff/hashdiff_tables.py b/data_diff/hashdiff_tables.py
@@ -118,14 +118,6 @@ def _validate_and_adjust_columns(self, table1: TableSegment, table2: TableSegmen
                 if lowest.precision != col2.precision:
                     table2._schema[c2] = attrs.evolve(col2, precision=lowest.precision)
 
-            elif isinstance(col1, ColType_UUID):
-                if strict and not isinstance(col2, ColType_UUID):
-                    raise TypeError(f"Incompatible types for column '{c1}':  {col1} <-> {col2}")
-
-            elif isinstance(col1, StringType):
-                if strict and not isinstance(col2, StringType):
-                    raise TypeError(f"Incompatible types for column '{c1}':  {col1} <-> {col2}")
-
         for t in [table1, table2]:
             for c in t.relevant_columns:
                 ctype = t._schema[c]
diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py
@@ -696,10 +696,12 @@ def setUp(self):
         self.differ = HashDiffer(bisection_factor=2)
 
     def test_right_table_empty(self):
-        self.assertRaises(ValueError, list, self.differ.diff_tables(self.a, self.b))
+        # NotImplementedError: Cannot use a column of type Text(_notes=[]) as a key
+        self.assertRaises(NotImplementedError, list, self.differ.diff_tables(self.a, self.b))
 
     def test_left_table_empty(self):
-        self.assertRaises(ValueError, list, self.differ.diff_tables(self.a, self.b))
+        # NotImplementedError: Cannot use a column of type Text(_notes=[]) as a key
+        self.assertRaises(NotImplementedError, list, self.differ.diff_tables(self.a, self.b))
 
 
 class TestInfoTree(DiffTestCase):