Skip to content

Commit 7650ecd

Browse files
committed
Add optional callback for diff progress
1 parent 269df51 commit 7650ecd

File tree

3 files changed

+66
-19
lines changed

3 files changed

+66
-19
lines changed

README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@ A.sync_from(B)
2222
A.sync_to(B)
2323
```
2424

25+
You may wish to peruse the [`diffsync` GitHub topic](https://github.com/topics/diffsync) for examples of projects using this library.
26+
2527
# Getting started
2628

27-
To be able to properly compare different datasets, DiffSync relies on a shared datamodel that both systems must use.
29+
To be able to properly compare different datasets, DiffSync relies on a shared data model that both systems must use.
2830
Specifically, each system or dataset must provide a `DiffSync` "adapter" subclass, which in turn represents its dataset as instances of one or more `DiffSyncModel` data model classes.
2931

3032
When comparing two systems, DiffSync detects the intersection between the two systems (which data models they have in common, and which attributes are shared between each pair of data models) and uses this intersection to compare and/or synchronize the data.
@@ -39,9 +41,9 @@ Each `DiffSyncModel` subclass supports the following class-level attributes:
3941
- `_attributes` - List of non-identifier instance field names for this object; used to identify the fields in common between data models for different systems (Optional)
4042
- `_children` - Dict of `{<model_name>: <field_name>}` indicating which fields store references to child data model instances. (Optional)
4143

42-
> DiffSyncModel instances must be uniquely identified by their unique id, composed of all fields defined in `_identifiers`. The unique id must be globally meaningful (such as an unique instance name or slug), as it is used to identify object correspondence between differing systems or data sets. It **must not** be a value that is only locally meaningful, such as a database primary key integer value.
44+
> DiffSyncModel instances must be uniquely identified by their unique ID (or, in database terminology, a [natural key](https://en.wikipedia.org/wiki/Natural_key)), which is composed of the union of all fields defined in `_identifiers`. The unique ID must be globally meaningful (such as a unique instance name or slug), as it is used to identify object correspondence between differing systems or data sets. It **must not** be a value that is only locally meaningful to a specific data set, such as a database primary key value.
4345
44-
> Only fields listed in `_identifiers`, `_attributes`, or `_children` will be potentially included in comparison and synchronization between systems or data sets. Any other fields will be ignored; this allows for a model to additionally contain fields that are only locally relevant (such as database primary key values) and therefore are irrelevant to comparisons.
46+
> Only fields listed in `_identifiers`, `_attributes`, or `_children` will be potentially included in comparison and synchronization between systems or data sets. Any other fields will be ignored; this allows for a model to additionally contain fields that are only locally relevant (such as database primary key values) and therefore are irrelevant to comparison and synchronization.
4547
4648
```python
4749
from typing import List, Optional

diffsync/__init__.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""DiffSync front-end classes and logic.
22
3-
Copyright (c) 2020 Network To Code, LLC <info@networktocode.com>
3+
Copyright (c) 2020-2021 Network To Code, LLC <info@networktocode.com>
44
55
Licensed under the Apache License, Version 2.0 (the "License");
66
you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
1616
"""
1717
from collections import defaultdict
1818
from inspect import isclass
19-
from typing import ClassVar, Dict, List, Mapping, MutableMapping, Optional, Text, Tuple, Type, Union
19+
from typing import Callable, ClassVar, Dict, List, Mapping, MutableMapping, Optional, Text, Tuple, Type, Union
2020

2121
from pydantic import BaseModel, PrivateAttr
2222
import structlog # type: ignore
@@ -359,7 +359,7 @@ def remove_child(self, child: "DiffSyncModel"):
359359
class DiffSync:
360360
"""Class for storing a group of DiffSyncModel instances and diffing/synchronizing to another DiffSync instance."""
361361

362-
# Add mapping of names to specific model classes here:
362+
# In any subclass, you would add a mapping of names to specific model classes here:
363363
# modelname1 = MyModelClass1
364364
# modelname2 = MyModelClass2
365365

@@ -418,6 +418,10 @@ def __str__(self):
418418
def __repr__(self):
419419
return f"<{str(self)}>"
420420

421+
def __len__(self):
422+
"""Total number of elements stored in self._data."""
423+
return sum(len(entries) for entries in self._data.values())
424+
421425
def load(self):
422426
"""Load all desired data from whatever backend data source into this instance."""
423427
# No-op in this generic class
@@ -502,29 +506,43 @@ def sync_complete(
502506
# ------------------------------------------------------------------------------
503507

504508
def diff_from(
505-
self, source: "DiffSync", diff_class: Type[Diff] = Diff, flags: DiffSyncFlags = DiffSyncFlags.NONE
509+
self,
510+
source: "DiffSync",
511+
diff_class: Type[Diff] = Diff,
512+
flags: DiffSyncFlags = DiffSyncFlags.NONE,
513+
callback: Optional[Callable[[int, int], None]] = None,
506514
) -> Diff:
507515
"""Generate a Diff describing the difference from the other DiffSync to this one.
508516
509517
Args:
510518
source (DiffSync): Object to diff against.
511519
diff_class (class): Diff or subclass thereof to use for diff calculation and storage.
512520
flags (DiffSyncFlags): Flags influencing the behavior of this diff operation.
521+
callback (function): Function with parameters (current, total), to be called at intervals as the
522+
calculation of the diff proceeds.
513523
"""
514-
differ = DiffSyncDiffer(src_diffsync=source, dst_diffsync=self, flags=flags, diff_class=diff_class)
524+
differ = DiffSyncDiffer(
525+
src_diffsync=source, dst_diffsync=self, flags=flags, diff_class=diff_class, callback=callback
526+
)
515527
return differ.calculate_diffs()
516528

517529
def diff_to(
518-
self, target: "DiffSync", diff_class: Type[Diff] = Diff, flags: DiffSyncFlags = DiffSyncFlags.NONE
530+
self,
531+
target: "DiffSync",
532+
diff_class: Type[Diff] = Diff,
533+
flags: DiffSyncFlags = DiffSyncFlags.NONE,
534+
callback: Optional[Callable[[int, int], None]] = None,
519535
) -> Diff:
520536
"""Generate a Diff describing the difference from this DiffSync to another one.
521537
522538
Args:
523539
target (DiffSync): Object to diff against.
524540
diff_class (class): Diff or subclass thereof to use for diff calculation and storage.
525541
flags (DiffSyncFlags): Flags influencing the behavior of this diff operation.
542+
callback (function): Function with parameters (current, total), to be called at intervals as the
543+
calculation of the diff proceeds.
526544
"""
527-
return target.diff_from(self, diff_class=diff_class, flags=flags)
545+
return target.diff_from(self, diff_class=diff_class, flags=flags, callback=callback)
528546

529547
# ------------------------------------------------------------------------------
530548
# Object Storage Management
@@ -567,21 +585,21 @@ def get(
567585
raise ObjectNotFound(f"{modelname} {uid} not present in {self.name}")
568586
return self._data[modelname][uid]
569587

570-
def get_all(self, obj: Union[Text, DiffSyncModel, Type[DiffSyncModel]]):
588+
def get_all(self, obj: Union[Text, DiffSyncModel, Type[DiffSyncModel]]) -> List[DiffSyncModel]:
571589
"""Get all objects of a given type.
572590
573591
Args:
574592
obj: DiffSyncModel class or instance, or modelname string, that defines the type of the objects to retrieve
575593
576594
Returns:
577-
ValuesList[DiffSyncModel]: List of Object
595+
List[DiffSyncModel]: List of all stored objects of the given type
578596
"""
579597
if isinstance(obj, str):
580598
modelname = obj
581599
else:
582600
modelname = obj.get_type()
583601

584-
return self._data[modelname].values()
602+
return list(self._data[modelname].values())
585603

586604
def get_by_uids(
587605
self, uids: List[Text], obj: Union[Text, DiffSyncModel, Type[DiffSyncModel]]

diffsync/helpers.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""DiffSync helper classes for calculating and performing diff and sync operations.
22
3-
Copyright (c) 2020 Network To Code, LLC <info@networktocode.com>
3+
Copyright (c) 2020-2021 Network To Code, LLC <info@networktocode.com>
44
55
Licensed under the Apache License, Version 2.0 (the "License");
66
you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
1515
limitations under the License.
1616
"""
1717
from collections.abc import Iterable as ABCIterable, Mapping as ABCMapping
18-
from typing import Iterable, List, Mapping, Optional, Tuple, Type, TYPE_CHECKING
18+
from typing import Callable, Iterable, List, Mapping, Optional, Tuple, Type, TYPE_CHECKING
1919

2020
import structlog # type: ignore
2121

@@ -29,14 +29,19 @@
2929
from . import DiffSync, DiffSyncModel # pylint: disable=cyclic-import
3030

3131

32-
class DiffSyncDiffer:
32+
class DiffSyncDiffer: # pylint: disable=too-many-instance-attributes
3333
"""Helper class implementing diff calculation logic for DiffSync.
3434
3535
Independent from Diff and DiffElement as those classes are purely data objects, while this stores some state.
3636
"""
3737

38-
def __init__(
39-
self, src_diffsync: "DiffSync", dst_diffsync: "DiffSync", flags: DiffSyncFlags, diff_class: Type[Diff] = Diff
38+
def __init__( # pylint: disable=too-many-arguments
39+
self,
40+
src_diffsync: "DiffSync",
41+
dst_diffsync: "DiffSync",
42+
flags: DiffSyncFlags,
43+
diff_class: Type[Diff] = Diff,
44+
callback: Optional[Callable[[int, int], None]] = None,
4045
):
4146
"""Create a DiffSyncDiffer for calculating diffs between the provided DiffSync instances."""
4247
self.src_diffsync = src_diffsync
@@ -45,13 +50,27 @@ def __init__(
4550

4651
self.logger = structlog.get_logger().new(src=src_diffsync, dst=dst_diffsync, flags=flags)
4752
self.diff_class = diff_class
53+
self.callback = callback
4854
self.diff: Optional[Diff] = None
4955

56+
self.models_processed = 0
57+
self.total_models = len(src_diffsync) + len(dst_diffsync)
58+
self.logger.debug(f"Diff calculation between these two datasets will involve {self.total_models} models")
59+
60+
def incr_models_processed(self, delta: int = 1):
61+
"""Increment self.models_processed, then call self.callback if present."""
62+
if delta:
63+
self.models_processed += delta
64+
if self.callback:
65+
self.callback(self.models_processed, self.total_models)
66+
5067
def calculate_diffs(self) -> Diff:
5168
"""Calculate diffs between the src and dst DiffSync objects and return the resulting Diff."""
5269
if self.diff is not None:
5370
return self.diff
5471

72+
self.models_processed = 0
73+
5574
self.logger.info("Beginning diff calculation")
5675
self.diff = self.diff_class()
5776
for obj_type in intersection(self.dst_diffsync.top_level, self.src_diffsync.top_level):
@@ -66,7 +85,7 @@ def calculate_diffs(self) -> Diff:
6685
self.diff.complete()
6786
return self.diff
6887

69-
def diff_object_list(self, src: Iterable["DiffSyncModel"], dst: Iterable["DiffSyncModel"]) -> List[DiffElement]:
88+
def diff_object_list(self, src: List["DiffSyncModel"], dst: List["DiffSyncModel"]) -> List[DiffElement]:
7089
"""Calculate diffs between two lists of like objects.
7190
7291
Helper method to `calculate_diffs`, usually doesn't need to be called directly.
@@ -90,6 +109,9 @@ def diff_object_list(self, src: Iterable["DiffSyncModel"], dst: Iterable["DiffSy
90109
# In the future we might support set, etc...
91110
raise TypeError(f"Type combination {type(src)}/{type(dst)} is not supported... for now")
92111

112+
# Any non-intersection between src and dst can be counted as "processed" and done.
113+
self.incr_models_processed(max(len(src) - len(combined_dict), 0) + max(len(dst) - len(combined_dict), 0))
114+
93115
self.validate_objects_for_diff(combined_dict.values())
94116

95117
for uid in combined_dict:
@@ -168,10 +190,15 @@ def diff_object_pair(
168190
diff_class=self.diff_class,
169191
)
170192

193+
delta = 0
171194
if src_obj:
172195
diff_element.add_attrs(source=src_obj.get_attrs(), dest=None)
196+
delta += 1
173197
if dst_obj:
174198
diff_element.add_attrs(source=None, dest=dst_obj.get_attrs())
199+
delta += 1
200+
201+
self.incr_models_processed(delta)
175202

176203
# Recursively diff the children of src_obj and dst_obj and attach the resulting diffs to the diff_element
177204
self.diff_child_objects(diff_element, src_obj, dst_obj)

0 commit comments

Comments
 (0)