1
- from typing import Tuple , Iterator , Optional , Union
1
+ from typing import Sequence , Tuple , Iterator , Optional , Union
2
2
3
3
from .tracking import disable_tracking
4
4
from .databases .connect import connect
5
5
from .databases .database_types import DbKey , DbTime , DbPath
6
- from .diff_tables import TableSegment , TableDiffer , DEFAULT_BISECTION_THRESHOLD , DEFAULT_BISECTION_FACTOR
6
+ from .diff_tables import Algorithm
7
+ from .hashdiff_tables import HashDiffer , DEFAULT_BISECTION_THRESHOLD , DEFAULT_BISECTION_FACTOR
8
+ from .joindiff_tables import JoinDiffer
9
+ from .table_segment import TableSegment
7
10
8
11
9
12
def connect_to_table(
    db_info: Union[str, dict],
    table_name: Union[DbPath, str],
    key_columns: Union[str, Tuple[str, ...]] = ("id",),
    thread_count: Optional[int] = 1,
    **kwargs,
) -> TableSegment:
    """Connects to the given database, and creates a TableSegment instance

    Parameters:
        db_info: Either a URI string, or a dict of connection options.
        table_name: Name of the table as a string, or a tuple that signifies the path.
        key_columns: Names of the key columns. A single column name may be given
            as a plain string; it is wrapped into a one-element tuple.
        thread_count: Number of threads for this connection (only if using a threadpooled db implementation)
        **kwargs: Passed through unchanged to the TableSegment constructor.

    Returns:
        A TableSegment created from the opened connection, the table path and the key columns.

    See Also:
        :meth:`connect`
    """
    # Accept a single column name for convenience; TableSegment always receives a tuple in that case.
    if isinstance(key_columns, str):
        key_columns = (key_columns,)

    db = connect(db_info, thread_count=thread_count)

    # A string table name is parsed by the database object into a path.
    if isinstance(table_name, str):
        table_name = db.parse_table_name(table_name)

    return TableSegment(db, table_name, key_columns, **kwargs)
31
39
32
40
33
41
def diff_tables (
34
42
table1 : TableSegment ,
35
43
table2 : TableSegment ,
36
44
* ,
37
45
# Names of the key columns, which together uniquely identify each row (usually just id)
38
- key_column : str = None ,
46
+ key_columns : Sequence [ str ] = None ,
39
47
# Name of updated column, which signals that rows changed (usually updated_at or last_update)
40
48
update_column : str = None ,
41
49
# Extra columns to compare
@@ -46,31 +54,63 @@ def diff_tables(
46
54
# Start/end update_column values, used to restrict the segment
47
55
min_update : DbTime = None ,
48
56
max_update : DbTime = None ,
49
- # Into how many segments to bisect per iteration
57
+ # Algorithm
58
+ algorithm : Algorithm = Algorithm .HASHDIFF ,
59
+ # Into how many segments to bisect per iteration (hashdiff only)
50
60
bisection_factor : int = DEFAULT_BISECTION_FACTOR ,
51
- # When should we stop bisecting and compare locally (in row count)
61
+ # When should we stop bisecting and compare locally (in row count; hashdiff only )
52
62
bisection_threshold : int = DEFAULT_BISECTION_THRESHOLD ,
53
63
# Enable/disable threaded diffing. Needed to take advantage of database threads.
54
64
threaded : bool = True ,
55
65
# Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
56
66
# There may be many pools, so number of actual threads can be a lot higher.
57
67
max_threadpool_size : Optional [int ] = 1 ,
58
- # Enable/disable debug prints
59
- debug : bool = False ,
60
68
) -> Iterator :
61
- """Efficiently finds the diff between table1 and table2.
69
+ """Finds the diff between table1 and table2.
70
+
71
+ Parameters:
72
+ key_columns (Tuple[str, ...]): Names of the key columns, which together uniquely identify each row (usually just id)
73
+ update_column (str, optional): Name of updated column, which signals that rows changed.
74
+ Usually updated_at or last_update. Used by `min_update` and `max_update`.
75
+ extra_columns (Tuple[str, ...], optional): Extra columns to compare
76
+ min_key (:data:`DbKey`, optional): Lowest key value, used to restrict the segment
77
+ max_key (:data:`DbKey`, optional): Highest key value, used to restrict the segment
78
+ min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
79
+ max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
80
+ algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`)
81
+ bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
82
+ bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
83
+ and compare locally. (Used when algorithm is `HASHDIFF`).
84
+ threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
85
+ max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
86
+ Only relevant when `threaded` is ``True``.
87
+ There may be many pools, so number of actual threads can be a lot higher.
88
+
89
+ Note:
90
+ The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
91
+ `key_columns`, `update_column`, `extra_columns`, `min_key`, `max_key`.
92
+ If different values are needed per table, it's possible to omit them here, and instead set
93
+ them directly when creating each :class:`TableSegment`.
62
94
63
95
Example:
64
96
>>> table1 = connect_to_table('postgresql:///', 'Rating', 'id')
65
97
>>> list(diff_tables(table1, table1))
66
98
[]
67
99
100
+ See Also:
101
+ :class:`TableSegment`
102
+ :class:`HashDiffer`
103
+ :class:`JoinDiffer`
104
+
68
105
"""
106
+ if isinstance (key_columns , str ):
107
+ key_columns = (key_columns ,)
108
+
69
109
tables = [table1 , table2 ]
70
110
override_attrs = {
71
111
k : v
72
112
for k , v in dict (
73
- key_column = key_column ,
113
+ key_columns = key_columns ,
74
114
update_column = update_column ,
75
115
extra_columns = extra_columns ,
76
116
min_key = min_key ,
@@ -83,11 +123,20 @@ def diff_tables(
83
123
84
124
segments = [t .new (** override_attrs ) for t in tables ] if override_attrs else tables
85
125
86
- differ = TableDiffer (
87
- bisection_factor = bisection_factor ,
88
- bisection_threshold = bisection_threshold ,
89
- debug = debug ,
90
- threaded = threaded ,
91
- max_threadpool_size = max_threadpool_size ,
92
- )
126
+ algorithm = Algorithm (algorithm )
127
+ if algorithm == Algorithm .HASHDIFF :
128
+ differ = HashDiffer (
129
+ bisection_factor = bisection_factor ,
130
+ bisection_threshold = bisection_threshold ,
131
+ threaded = threaded ,
132
+ max_threadpool_size = max_threadpool_size ,
133
+ )
134
+ elif algorithm == Algorithm .JOINDIFF :
135
+ differ = JoinDiffer (
136
+ threaded = threaded ,
137
+ max_threadpool_size = max_threadpool_size ,
138
+ )
139
+ else :
140
+ raise ValueError (f"Unknown algorithm: { algorithm } " )
141
+
93
142
return differ .diff_tables (* segments )
0 commit comments