Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 34f8403

Browse files
Sergey Vasilyev (author) committed:
Restrict bisection threshold to pure integers, disable bisection only explicitly
1 parent 598d98c commit 34f8403

File tree

2 files changed: 10 additions and 6 deletions.

data_diff/hashdiff_tables.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,8 @@ class HashDiffer(TableDiffer):
 71  71       """
 72  72
 73  73       bisection_factor: int = DEFAULT_BISECTION_FACTOR
 74     -     bisection_threshold: Number = DEFAULT_BISECTION_THRESHOLD # Accepts inf for tests
     74 +     bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD
     75 +     bisection_disabled: bool = False # i.e. always download the rows (used in tests)
 75  76
 76  77       stats: dict = attrs.field(factory=dict)
 77  78
@@ -157,7 +158,7 @@ def _diff_segments(
157 158           # default, data-diff will checksum the section first (when it's below
158 159           # the threshold) and _then_ download it.
159 160           if BENCHMARK:
160     -             if max_rows < self.bisection_threshold:
    161 +             if self.bisection_disabled or max_rows < self.bisection_threshold:
161 162                   return self._bisect_and_diff_segments(ti, table1, table2, info_tree, level=level, max_rows=max_rows)
162 163
163 164           (count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])
@@ -202,7 +203,7 @@ def _bisect_and_diff_segments(
202 203
203 204           # If count is below the threshold, just download and compare the columns locally
204 205           # This saves time, as bisection speed is limited by ping and query performance.
205     -         if max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2:
    206 +         if self.bisection_disabled or max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2:
206 207               rows1, rows2 = self._threaded_call("get_values", [table1, table2])
207 208               json_cols = {
208 209                   i: colname

tests/test_database_types.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
      1 + import sys
  1   2   import unittest
  2   3   import time
  3   4   import json
  4   5   import re
  5     - import math
  6   6   import uuid
  7   7   from datetime import datetime, timedelta, timezone
  8   8   import logging
@@ -765,10 +765,13 @@ def test_types(self, source_db, target_db, source_type, target_type, type_catego
765 765           # reasonable amount of rows each. These will then be downloaded in
766 766           # parallel, using the existing implementation.
767 767           dl_factor = max(int(N_SAMPLES / 100_000), 2) if BENCHMARK else 2
768     -         dl_threshold = int(N_SAMPLES / dl_factor) + 1 if BENCHMARK else math.inf
    768 +         dl_threshold = int(N_SAMPLES / dl_factor) + 1 if BENCHMARK else sys.maxsize
769 769           dl_threads = N_THREADS
770 770           differ = HashDiffer(
771     -             bisection_threshold=dl_threshold, bisection_factor=dl_factor, max_threadpool_size=dl_threads
    771 +             bisection_factor=dl_factor,
    772 +             bisection_threshold=dl_threshold,
    773 +             bisection_disabled=True,
    774 +             max_threadpool_size=dl_threads,
772 775           )
773 776           start = time.monotonic()
774 777           diff = list(differ.diff_tables(self.table, self.table2))

0 commit comments

Comments
 (0)