
Commit 0bdd631

Author: Ubuntu
Commit message: Review feedback
1 parent c61e60f commit 0bdd631

File tree

8 files changed (+64, -18 lines)

nucleus/annotation.py

Lines changed: 9 additions & 2 deletions
@@ -71,7 +71,13 @@ def to_json(self) -> str:
         """Serializes annotation object to schematized JSON string."""
         return json.dumps(self.to_payload(), allow_nan=False)
 
-    def has_local_files(self) -> bool:
+    def has_local_files_to_upload(self) -> bool:
+        """Returns True if the annotation has local files that need to be uploaded.
+
+        Nearly all subclasses have no local files, so we default to returning
+        False. A subclass with local files should override this method (though
+        that alone is not enough to make local file upload work).
+        """
         return False
 
 

@@ -582,7 +588,8 @@ def to_payload(self) -> dict:
 
         return payload
 
-    def has_local_files(self) -> bool:
+    def has_local_files_to_upload(self) -> bool:
+        """Check if the mask url is local and needs to be uploaded."""
         if is_local_path(self.mask_url):
            if not os.path.isfile(self.mask_url):
                 raise Exception(f"Mask file {self.mask_url} does not exist.")
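
Note: the new docstring says a subclass with local files should override has_local_files_to_upload. As a hedged illustration of that pattern (the subclass and its mask_url field below are invented for this note, not part of the commit; the sketch is written as if it lived inside nucleus/annotation.py, where Annotation and is_local_path are already in scope):

# Hypothetical subclass, for illustration only.
class LocalMaskAnnotation(Annotation):
    def has_local_files_to_upload(self) -> bool:
        # Report a local file so the uploader routes this annotation
        # through the local-file upload path.
        return is_local_path(self.mask_url)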

nucleus/annotation_uploader.py

Lines changed: 7 additions & 1 deletion
@@ -32,6 +32,11 @@ def accumulate_dict_values(dicts: Iterable[dict]):
 
 
 class AnnotationUploader:
+    """Helper class not intended for direct use; please use dataset.annotate.
+
+    This class exists purely to implement dataset.annotate.
+    """
+
     def __init__(self, dataset_id: str, client: "NucleusClient"):  # noqa: F821
         self.dataset_id = dataset_id
         self._client = client
@@ -45,14 +50,15 @@ def upload(
         local_files_per_upload_request: int = 10,
         local_file_upload_concurrency: int = 30,
     ):
+        """For more details on parameters and functionality, see dataset.annotate."""
         if local_files_per_upload_request > 10:
             raise ValueError("local_files_per_upload_request must be <= 10")
         annotations_without_files: List[Annotation] = []
         segmentations_with_local_files: List[SegmentationAnnotation] = []
         segmentations_with_remote_files: List[SegmentationAnnotation] = []
 
         for annotation in annotations:
-            if annotation.has_local_files():
+            if annotation.has_local_files_to_upload():
                 # Only segmentations have local files currently, and probably
                 # for a long time to come.
                 assert isinstance(annotation, SegmentationAnnotation)
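
For context, upload() initializes three buckets before the loop shown above. A plausible reading of the full dispatch, sketched under the assumption that the two arms not visible in this diff simply append to the remaining lists:

# Assumed continuation of the loop above; only the first branch appears
# in this diff, the other two arms are illustrative guesses.
for annotation in annotations:
    if annotation.has_local_files_to_upload():
        # Only segmentations have local files currently.
        assert isinstance(annotation, SegmentationAnnotation)
        segmentations_with_local_files.append(annotation)
    elif isinstance(annotation, SegmentationAnnotation):
        segmentations_with_remote_files.append(annotation)
    else:
        annotations_without_files.append(annotation)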

nucleus/async_utils.py

Lines changed: 5 additions & 1 deletion
@@ -29,6 +29,7 @@ class FileFormField:
 
 
 async def gather_with_concurrency(n, *tasks):
+    """Helper method to limit the concurrency when gathering the results from multiple tasks."""
     semaphore = asyncio.Semaphore(n)
 
     async def sem_task(task):
@@ -107,6 +108,9 @@ def make_many_form_data_requests_concurrently(
         requests: Each request should be a FormDataContextHandler object which will
             handle generating form data, and opening/closing files for each request.
         route: route for the request.
+        progressbar: A tqdm progress bar to use for showing progress to the user.
+        concurrency: How many concurrent requests to run at once. Should be exposed
+            to the user.
     """
     loop = get_event_loop()
     return loop.run_until_complete(
@@ -168,7 +172,7 @@ async def _post_form_data(
 
     logger.info("Posting to %s", endpoint)
 
-    for sleep_time in RetryStrategy.sleep_times + [-1]:
+    for sleep_time in RetryStrategy.sleep_times() + [-1]:
         with request as form:
             async with session.post(
                 endpoint,
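
The context lines above show most of gather_with_concurrency already. A self-contained sketch with a usage example, assuming the body completes with a plain asyncio.gather over the wrapped tasks (the final gather call is not visible in this diff):

import asyncio

async def gather_with_concurrency(n, *tasks):
    """Limit concurrency while gathering the results from multiple tasks."""
    semaphore = asyncio.Semaphore(n)

    async def sem_task(task):
        async with semaphore:  # at most n coroutines run at once
            return await task

    # Assumed completion: gather the wrapped tasks, preserving input order.
    return await asyncio.gather(*(sem_task(task) for task in tasks))

async def main():
    async def job(i):
        await asyncio.sleep(0.01)  # stand-in for a network request
        return i

    results = await gather_with_concurrency(5, *(job(i) for i in range(20)))
    assert results == list(range(20))

asyncio.run(main())

Separately, the retry loop in _post_form_data now calls sleep_times() and appends -1; that reads as a sentinel granting one final attempt after the last wait, though the diff does not show how the -1 is consumed.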

nucleus/connection.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ def make_request(
 
     logger.info("Make request to %s", endpoint)
 
-    for retry_wait_time in RetryStrategy.sleep_times:
+    for retry_wait_time in RetryStrategy.sleep_times():
         response = requests_command(
             endpoint,
             json=payload,
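
Both call sites now invoke sleep_times() as a method so each request gets a fresh jittered schedule. A hedged sketch of the retry pattern this loop implements (make_request's real response handling is not in this diff; requests.post and the break condition below are assumptions for illustration):

import time

import requests

from nucleus.retry_strategy import RetryStrategy

def post_with_retries(endpoint: str, payload: dict) -> requests.Response:
    # Illustrative only: retry on the status codes RetryStrategy lists,
    # sleeping the jittered wait between attempts, then give up.
    response = None
    for retry_wait_time in RetryStrategy.sleep_times():
        response = requests.post(endpoint, json=payload)
        if response.status_code not in RetryStrategy.statuses:
            break  # success or a non-retryable failure
        time.sleep(retry_wait_time)
    return response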

nucleus/retry_strategy.py

Lines changed: 9 additions & 1 deletion
@@ -1,4 +1,12 @@
 # TODO: use retry library instead of custom code. Tenacity is one option.
+import random
+
+
 class RetryStrategy:
     statuses = {503, 524, 520, 504}
-    sleep_times = [1, 3, 9, 27]  # These are in seconds
+
+    @staticmethod
+    def sleep_times():
+        sleep_times = [1, 3, 9, 27]  # These are in seconds
+
+        return [2 * random.random() * t for t in sleep_times]
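
This swaps the fixed backoff schedule for a jittered one: each wait is drawn uniformly from [0, 2t], so its expected value is still the old t, but concurrent clients no longer retry in lockstep. A quick demonstration (the seed is only to make the example reproducible):

import random

from nucleus.retry_strategy import RetryStrategy

random.seed(0)  # reproducible output for this example only
print(RetryStrategy.sleep_times())
# -> roughly [1.69, 4.55, 7.57, 13.98] with this seed; a fresh random
#    schedule per call, averaging the old fixed [1, 3, 9, 27] seconds.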

tests/helpers.py

Lines changed: 2 additions & 0 deletions
@@ -304,6 +304,8 @@ def reference_id_from_url(url):
 this_dir = os.path.dirname(os.path.realpath(__file__))
 TEST_LOCAL_MASK_URL = os.path.join(this_dir, "testdata/000000000285.png")
 
+
+NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET = len(TEST_DATASET_ITEMS)
 TEST_SEGMENTATION_ANNOTATIONS = [
     {
         "reference_id": reference_id_from_url(TEST_IMG_URLS[i]),

tests/test_dataset.py

Lines changed: 3 additions & 2 deletions
@@ -262,6 +262,7 @@ def test_dataset_append_local(CLIENT, dataset):
             reference_id="bad",
         )
     ]
+    num_local_items_to_test = 10
     with pytest.raises(ValueError) as e:
         dataset.append(ds_items_local_error)
     assert "Out of range float values are not JSON compliant" in str(
@@ -273,15 +274,15 @@ def test_dataset_append_local(CLIENT, dataset):
             metadata={"test": 0},
             reference_id=LOCAL_FILENAME.split("/")[-1] + str(i),
         )
-        for i in range(1000)
+        for i in range(num_local_items_to_test)
     ]
 
     response = dataset.append(ds_items_local)
 
     assert isinstance(response, UploadResponse)
     resp_json = response.json()
     assert resp_json[DATASET_ID_KEY] == dataset.id
-    assert resp_json[NEW_ITEMS] == 1000
+    assert resp_json[NEW_ITEMS] == num_local_items_to_test
     assert resp_json[UPDATED_ITEMS] == 0
     assert resp_json[IGNORED_ITEMS] == 0
     assert resp_json[ERROR_ITEMS] == 0

tests/test_segmentation.py

Lines changed: 28 additions & 10 deletions
@@ -1,6 +1,7 @@
 from nucleus.annotation import SegmentationAnnotation
 from nucleus.dataset import Dataset
 from tests.helpers import (
+    NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET,
     TEST_LOCAL_MASK_URL,
     TEST_SEGMENTATION_ANNOTATIONS,
     assert_segmentation_annotation_matches_dict,
@@ -38,15 +39,17 @@ def test_batch_local_semseg_gt_upload(dataset: Dataset):
         request_annotation.mask_url = TEST_LOCAL_MASK_URL
     response = dataset.annotate(annotations=request_annotations)
 
-    print(request_annotations)
-    print(response)
-
     assert response["dataset_id"] == dataset.id
-    assert response["annotations_processed"] == 4
+    assert (
+        response["annotations_processed"]
+        == NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET
+    )
     assert response["annotations_ignored"] == 0
     assert bad_reference_id in response["errors"][0]
 
-    for request_annotation in request_annotations[:4]:
+    for request_annotation in request_annotations[
+        :NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET
+    ]:
         response_annotation = dataset.refloc(request_annotation.reference_id)[
             "annotations"
         ]["segmentation"][0]
@@ -78,7 +81,10 @@ def test_batch_semseg_gt_upload(dataset):
     ]
     response = dataset.annotate(annotations=annotations)
     assert response["dataset_id"] == dataset.id
-    assert response["annotations_processed"] == 4
+    assert (
+        response["annotations_processed"]
+        == NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET
+    )
     assert response["annotations_ignored"] == 0
 
 

@@ -90,14 +96,20 @@ def test_batch_semseg_gt_upload_ignore(dataset):
     ]
     response = dataset.annotate(annotations=annotations)
     assert response["dataset_id"] == dataset.id
-    assert response["annotations_processed"] == 4
+    assert (
+        response["annotations_processed"]
+        == NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET
+    )
     assert response["annotations_ignored"] == 0
 
     # When we re-upload, expect them to be ignored
     response = dataset.annotate(annotations=annotations)
     assert response["dataset_id"] == dataset.id
     assert response["annotations_processed"] == 0
-    assert response["annotations_ignored"] == 4
+    assert (
+        response["annotations_ignored"]
+        == NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET
+    )
 
 
 def test_batch_semseg_gt_upload_update(dataset):
@@ -108,11 +120,17 @@ def test_batch_semseg_gt_upload_update(dataset):
     ]
     response = dataset.annotate(annotations=annotations)
     assert response["dataset_id"] == dataset.id
-    assert response["annotations_processed"] == 4
+    assert (
+        response["annotations_processed"]
+        == NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET
+    )
     assert response["annotations_ignored"] == 0
 
     # When we re-upload, expect uploads to be processed
     response = dataset.annotate(annotations=annotations, update=True)
     assert response["dataset_id"] == dataset.id
-    assert response["annotations_processed"] == 4
+    assert (
+        response["annotations_processed"]
+        == NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET
+    )
     assert response["annotations_ignored"] == 0
