Restrict bisection threshold to pure integers, disable bisection only explicitly

Sergey Vasilyev · Sergey Vasilyev · commit 34f84037bece · 2023-10-18T17:45:30.000+02:00
diff --git a/data_diff/hashdiff_tables.py b/data_diff/hashdiff_tables.py
@@ -71,7 +71,8 @@ class HashDiffer(TableDiffer):
     """
 
     bisection_factor: int = DEFAULT_BISECTION_FACTOR
-    bisection_threshold: Number = DEFAULT_BISECTION_THRESHOLD  # Accepts inf for tests
+    bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD
+    bisection_disabled: bool = False  # i.e. always download the rows (used in tests)
 
     stats: dict = attrs.field(factory=dict)
 
@@ -157,7 +158,7 @@ def _diff_segments(
         # default, data-diff will checksum the section first (when it's below
         # the threshold) and _then_ download it.
         if BENCHMARK:
-            if max_rows < self.bisection_threshold:
+            if self.bisection_disabled or max_rows < self.bisection_threshold:
                 return self._bisect_and_diff_segments(ti, table1, table2, info_tree, level=level, max_rows=max_rows)
 
         (count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])
@@ -202,7 +203,7 @@ def _bisect_and_diff_segments(
 
         # If count is below the threshold, just download and compare the columns locally
         # This saves time, as bisection speed is limited by ping and query performance.
-        if max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2:
+        if self.bisection_disabled or max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2:
             rows1, rows2 = self._threaded_call("get_values", [table1, table2])
             json_cols = {
                 i: colname
diff --git a/tests/test_database_types.py b/tests/test_database_types.py
@@ -1,8 +1,8 @@
+import sys
 import unittest
 import time
 import json
 import re
-import math
 import uuid
 from datetime import datetime, timedelta, timezone
 import logging
@@ -765,10 +765,13 @@ def test_types(self, source_db, target_db, source_type, target_type, type_catego
         # reasonable amount of rows each. These will then be downloaded in
         # parallel, using the existing implementation.
         dl_factor = max(int(N_SAMPLES / 100_000), 2) if BENCHMARK else 2
-        dl_threshold = int(N_SAMPLES / dl_factor) + 1 if BENCHMARK else math.inf
+        dl_threshold = int(N_SAMPLES / dl_factor) + 1 if BENCHMARK else sys.maxsize
         dl_threads = N_THREADS
         differ = HashDiffer(
-            bisection_threshold=dl_threshold, bisection_factor=dl_factor, max_threadpool_size=dl_threads
+            bisection_factor=dl_factor,
+            bisection_threshold=dl_threshold,
+            bisection_disabled=True,
+            max_threadpool_size=dl_threads,
         )
         start = time.monotonic()
         diff = list(differ.diff_tables(self.table, self.table2))