Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 6738ca7

Browse files
authored
Merge pull request #802 from datafold/tolerate-empty-samples
Tolerate empty samples & allow custom database schemas
2 parents c1dde75 + f1ef90a commit 6738ca7

File tree

4 files changed

+10
-24
lines changed

4 files changed

+10
-24
lines changed

data_diff/databases/_connect.py

Lines changed: 5 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@
2626
from data_diff.databases.mssql import MsSQL
2727

2828

29-
@attrs.define(frozen=True)
29+
@attrs.frozen
3030
class MatchUriPath:
3131
database_cls: Type[Database]
3232

@@ -98,13 +98,11 @@ class Connect:
9898
"""Provides methods for connecting to a supported database using a URL or connection dict."""
9999

100100
database_by_scheme: Dict[str, Database]
101-
match_uri_path: Dict[str, MatchUriPath]
102101
conn_cache: MutableMapping[Hashable, Database]
103102

104103
def __init__(self, database_by_scheme: Dict[str, Database] = DATABASE_BY_SCHEME):
105104
super().__init__()
106105
self.database_by_scheme = database_by_scheme
107-
self.match_uri_path = {name: MatchUriPath(cls) for name, cls in database_by_scheme.items()}
108106
self.conn_cache = weakref.WeakValueDictionary()
109107

110108
def for_databases(self, *dbs) -> Self:
@@ -157,12 +155,10 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs)
157155
return self.connect_with_dict(conn_dict, thread_count, **kwargs)
158156

159157
try:
160-
matcher = self.match_uri_path[scheme]
158+
cls = self.database_by_scheme[scheme]
161159
except KeyError:
162160
raise NotImplementedError(f"Scheme '{scheme}' currently not supported")
163161

164-
cls = matcher.database_cls
165-
166162
if scheme == "databricks":
167163
assert not dsn.user
168164
kw = {}
@@ -175,6 +171,7 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs)
175171
kw["filepath"] = dsn.dbname
176172
kw["dbname"] = dsn.user
177173
else:
174+
matcher = MatchUriPath(cls)
178175
kw = matcher.match_path(dsn)
179176

180177
if scheme == "bigquery":
@@ -198,7 +195,7 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs)
198195

199196
kw = {k: v for k, v in kw.items() if v is not None}
200197

201-
if issubclass(cls, ThreadedDatabase):
198+
if isinstance(cls, type) and issubclass(cls, ThreadedDatabase):
202199
db = cls(thread_count=thread_count, **kw, **kwargs)
203200
else:
204201
db = cls(**kw, **kwargs)
@@ -209,11 +206,10 @@ def connect_with_dict(self, d, thread_count, **kwargs):
209206
d = dict(d)
210207
driver = d.pop("driver")
211208
try:
212-
matcher = self.match_uri_path[driver]
209+
cls = self.database_by_scheme[driver]
213210
except KeyError:
214211
raise NotImplementedError(f"Driver '{driver}' currently not supported")
215212

216-
cls = matcher.database_cls
217213
if issubclass(cls, ThreadedDatabase):
218214
db = cls(thread_count=thread_count, **d, **kwargs)
219215
else:

data_diff/databases/base.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1093,11 +1093,7 @@ def _refine_coltypes(
10931093
list,
10941094
log_message=table_path,
10951095
)
1096-
if not samples_by_row:
1097-
raise ValueError(f"Table {table_path} is empty.")
1098-
1099-
samples_by_col = list(zip(*samples_by_row))
1100-
1096+
samples_by_col = list(zip(*samples_by_row)) if samples_by_row else [[]] * len(text_columns)
11011097
for col_name, samples in safezip(text_columns, samples_by_col):
11021098
uuid_samples = [s for s in samples if s and is_uuid(s)]
11031099

data_diff/hashdiff_tables.py

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -118,14 +118,6 @@ def _validate_and_adjust_columns(self, table1: TableSegment, table2: TableSegmen
118118
if lowest.precision != col2.precision:
119119
table2._schema[c2] = attrs.evolve(col2, precision=lowest.precision)
120120

121-
elif isinstance(col1, ColType_UUID):
122-
if strict and not isinstance(col2, ColType_UUID):
123-
raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}")
124-
125-
elif isinstance(col1, StringType):
126-
if strict and not isinstance(col2, StringType):
127-
raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}")
128-
129121
for t in [table1, table2]:
130122
for c in t.relevant_columns:
131123
ctype = t._schema[c]

tests/test_diff_tables.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -696,10 +696,12 @@ def setUp(self):
696696
self.differ = HashDiffer(bisection_factor=2)
697697

698698
def test_right_table_empty(self):
699-
self.assertRaises(ValueError, list, self.differ.diff_tables(self.a, self.b))
699+
# NotImplementedError: Cannot use a column of type Text(_notes=[]) as a key
700+
self.assertRaises(NotImplementedError, list, self.differ.diff_tables(self.a, self.b))
700701

701702
def test_left_table_empty(self):
702-
self.assertRaises(ValueError, list, self.differ.diff_tables(self.a, self.b))
703+
# NotImplementedError: Cannot use a column of type Text(_notes=[]) as a key
704+
self.assertRaises(NotImplementedError, list, self.differ.diff_tables(self.a, self.b))
703705

704706

705707
class TestInfoTree(DiffTestCase):

0 commit comments

Comments
 (0)