Skip to content

Commit 3dce534

Browse files
author
Ubuntu
committed
Initial pass at client changes for prediction segmentation upload
1 parent aea3111 commit 3dce534

File tree

4 files changed

+102
-108
lines changed

4 files changed

+102
-108
lines changed

nucleus/__init__.py

Lines changed: 0 additions & 87 deletions
Original file line number | Diff line number | Diff line change
@@ -512,93 +512,6 @@ def create_model_run(self, dataset_id: str, payload: dict) -> ModelRun:
512512
response[MODEL_RUN_ID_KEY], dataset_id=dataset_id, client=self
513513
)
514514

515-
@deprecated("Use Dataset.upload_predictions instead.")
516-
def predict(
517-
self,
518-
annotations: List[
519-
Union[
520-
BoxPrediction,
521-
PolygonPrediction,
522-
CuboidPrediction,
523-
SegmentationPrediction,
524-
CategoryPrediction,
525-
]
526-
],
527-
model_run_id: Optional[str] = None,
528-
model_id: Optional[str] = None,
529-
dataset_id: Optional[str] = None,
530-
update: bool = False,
531-
batch_size: int = 5000,
532-
):
533-
if model_run_id is not None:
534-
assert model_id is None and dataset_id is None
535-
endpoint = f"modelRun/{model_run_id}/predict"
536-
else:
537-
assert (
538-
model_id is not None and dataset_id is not None
539-
), "Model ID and dataset ID are required if not using model run id."
540-
endpoint = (
541-
f"dataset/{dataset_id}/model/{model_id}/uploadPredictions"
542-
)
543-
segmentations = [
544-
ann
545-
for ann in annotations
546-
if isinstance(ann, SegmentationPrediction)
547-
]
548-
549-
other_predictions = [
550-
ann
551-
for ann in annotations
552-
if not isinstance(ann, SegmentationPrediction)
553-
]
554-
555-
s_batches = [
556-
segmentations[i : i + batch_size]
557-
for i in range(0, len(segmentations), batch_size)
558-
]
559-
560-
batches = [
561-
other_predictions[i : i + batch_size]
562-
for i in range(0, len(other_predictions), batch_size)
563-
]
564-
565-
errors = []
566-
predictions_processed = 0
567-
predictions_ignored = 0
568-
569-
tqdm_batches = self.tqdm_bar(batches)
570-
571-
for batch in tqdm_batches:
572-
batch_payload = construct_box_predictions_payload(
573-
batch,
574-
update,
575-
)
576-
response = self.make_request(batch_payload, endpoint)
577-
if STATUS_CODE_KEY in response:
578-
errors.append(response)
579-
else:
580-
predictions_processed += response[PREDICTIONS_PROCESSED_KEY]
581-
predictions_ignored += response[PREDICTIONS_IGNORED_KEY]
582-
if ERRORS_KEY in response:
583-
errors += response[ERRORS_KEY]
584-
585-
for s_batch in s_batches:
586-
payload = construct_segmentation_payload(s_batch, update)
587-
response = self.make_request(payload, endpoint)
588-
# pbar.update(1)
589-
if STATUS_CODE_KEY in response:
590-
errors.append(response)
591-
else:
592-
predictions_processed += response[PREDICTIONS_PROCESSED_KEY]
593-
predictions_ignored += response[PREDICTIONS_IGNORED_KEY]
594-
595-
return {
596-
MODEL_RUN_ID_KEY: model_run_id,
597-
PREDICTIONS_PROCESSED_KEY: predictions_processed,
598-
PREDICTIONS_IGNORED_KEY: predictions_ignored,
599-
ERRORS_KEY: errors,
600-
}
601-
602515
@deprecated(
603516
"Model runs have been deprecated and will be removed. Use a Model instead."
604517
)

nucleus/annotation_uploader.py

Lines changed: 33 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
import json
2-
from typing import TYPE_CHECKING, Iterable, List, Sequence
2+
from typing import TYPE_CHECKING, Iterable, List, Optional, Sequence
33

44
from nucleus.annotation import Annotation, SegmentationAnnotation
55
from nucleus.async_utils import (
@@ -34,12 +34,14 @@ def accumulate_dict_values(dicts: Iterable[dict]):
3434
class AnnotationUploader:
3535
"""This is a helper class not intended for direct use. Please use dataset.annotate.
3636
37-
This class is purely a helper class for implementing dataset.annotate.
37+
This class is purely a helper class for implementing dataset.annotate/dataset.predict.
3838
"""
3939

40-
def __init__(self, dataset_id: str, client: "NucleusClient"): # noqa: F821
41-
self.dataset_id = dataset_id
40+
def __init__(
41+
self, dataset_id: Optional[str], client: "NucleusClient"
42+
): # noqa: F821
4243
self._client = client
44+
self._route = f"dataset/{dataset_id}/annotate"
4345

4446
def upload(
4547
self,
@@ -83,7 +85,7 @@ def upload(
8385
# segmentation will take a lot longer for the server to process than a single
8486
# annotation of any other kind.
8587
responses.extend(
86-
self.make_batched_annotate_requests(
88+
self.make_batched_requests(
8789
segmentations_with_remote_files,
8890
update,
8991
batch_size=remote_files_per_upload_request,
@@ -92,7 +94,7 @@ def upload(
9294
)
9395
if annotations_without_files:
9496
responses.extend(
95-
self.make_batched_annotate_requests(
97+
self.make_batched_requests(
9698
annotations_without_files,
9799
update,
98100
batch_size=batch_size,
@@ -102,7 +104,7 @@ def upload(
102104

103105
return accumulate_dict_values(responses)
104106

105-
def make_batched_annotate_requests(
107+
def make_batched_requests(
106108
self,
107109
annotations: Sequence[Annotation],
108110
update: bool,
@@ -120,9 +122,7 @@ def make_batched_annotate_requests(
120122
for batch in self._client.tqdm_bar(batches, desc=progress_bar_name):
121123
payload = construct_annotation_payload(batch, update)
122124
responses.append(
123-
self._client.make_request(
124-
payload, route=f"dataset/{self.dataset_id}/annotate"
125-
)
125+
self._client.make_request(payload, route=self._route)
126126
)
127127
return responses
128128

@@ -149,7 +149,7 @@ def make_batched_file_form_data_requests(
149149
return make_many_form_data_requests_concurrently(
150150
client=self._client,
151151
requests=requests,
152-
route=f"dataset/{self.dataset_id}/annotate",
152+
route=self._route,
153153
progressbar=progressbar,
154154
concurrency=local_file_upload_concurrency,
155155
)
@@ -202,3 +202,25 @@ def fn():
202202
return form_data, file_pointers
203203

204204
return fn
205+
206+
207+
class PredictionUploader(AnnotationUploader):
208+
def __init__(
209+
self,
210+
client: "NucleusClient",
211+
dataset_id: Optional[str] = None,
212+
model_id: Optional[str] = None,
213+
model_run_id: Optional[str] = None,
214+
):
215+
super().__init__(dataset_id, client)
216+
self._client = client
217+
if model_run_id is not None:
218+
assert model_id is None and dataset_id is None
219+
self._route = f"modelRun/{model_run_id}/predict"
220+
else:
221+
assert (
222+
model_id is not None and dataset_id is not None
223+
), "Model ID and dataset ID are required if not using model run id."
224+
self._route = (
225+
f"dataset/{dataset_id}/model/{model_id}/uploadPredictions"
226+
)

nucleus/dataset.py

Lines changed: 29 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33

44
import requests
55

6-
from nucleus.annotation_uploader import AnnotationUploader
6+
from nucleus.annotation_uploader import AnnotationUploader, PredictionUploader
77
from nucleus.job import AsyncJob
88
from nucleus.prediction import (
99
BoxPrediction,
@@ -347,6 +347,7 @@ def annotate(
347347
request. Segmentations have either local or remote files, if you are
348348
getting timeouts while uploading segmentations with local files, you
349349
should lower this value from its default of 10. The maximum is 10.
350+
local_file_upload_concurrency: Number of concurrent local file uploads.
350351
351352
352353
Returns:
@@ -1283,6 +1284,10 @@ def upload_predictions(
12831284
],
12841285
update: bool = False,
12851286
asynchronous: bool = False,
1287+
batch_size: int = 5000,
1288+
remote_files_per_upload_request: int = 20,
1289+
local_files_per_upload_request: int = 10,
1290+
local_file_upload_concurrency: int = 30,
12861291
):
12871292
"""Uploads predictions and associates them with an existing :class:`Model`.
12881293
@@ -1325,6 +1330,21 @@ def upload_predictions(
13251330
collision. Default is False.
13261331
asynchronous: Whether or not to process the upload asynchronously (and
13271332
return an :class:`AsyncJob` object). Default is False.
1333+
batch_size: Number of predictions processed in each concurrent batch.
1334+
Default is 5000. If you get timeouts when uploading geometric predictions,
1335+
you can try lowering this batch size. This is only relevant for
1336+
asynchronous=False
1337+
remote_files_per_upload_request: Number of remote files to upload in each
1338+
request. Segmentations have either local or remote files, if you are
1339+
getting timeouts while uploading segmentations with remote urls, you
1340+
should lower this value from its default of 20. This is only relevant for
1341+
asynchronous=False.
1342+
local_files_per_upload_request: Number of local files to upload in each
1343+
request. Segmentations have either local or remote files, if you are
1344+
getting timeouts while uploading segmentations with local files, you
1345+
should lower this value from its default of 10. The maximum is 10.
1346+
This is only relevant for asynchronous=False
1347+
local_file_upload_concurrency: Number of concurrent local file uploads.
13281348
13291349
Returns:
13301350
Payload describing the synchronous upload::
@@ -1348,12 +1368,19 @@ def upload_predictions(
13481368
)
13491369
return AsyncJob.from_json(response, self._client)
13501370
else:
1351-
return self._client.predict(
1371+
uploader = PredictionUploader(
13521372
model_run_id=None,
13531373
dataset_id=self.id,
13541374
model_id=model.id,
1375+
client=self._client,
1376+
)
1377+
return uploader.upload(
13551378
annotations=predictions,
1379+
batch_size=batch_size,
13561380
update=update,
1381+
remote_files_per_upload_request=remote_files_per_upload_request,
1382+
local_files_per_upload_request=local_files_per_upload_request,
1383+
local_file_upload_concurrency=local_file_upload_concurrency,
13571384
)
13581385

13591386
def predictions_iloc(self, model, index):

nucleus/model_run.py

Lines changed: 40 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
import requests
1919

2020
from nucleus.annotation import check_all_mask_paths_remote
21+
from nucleus.annotation_uploader import PredictionUploader
2122
from nucleus.job import AsyncJob
2223
from nucleus.utils import (
2324
format_prediction_response,
@@ -114,12 +115,38 @@ def predict(
114115
SegmentationPrediction,
115116
]
116117
],
117-
update: Optional[bool] = DEFAULT_ANNOTATION_UPDATE_MODE,
118+
update: bool = DEFAULT_ANNOTATION_UPDATE_MODE,
118119
asynchronous: bool = False,
120+
batch_size: int = 5000,
121+
remote_files_per_upload_request: int = 20,
122+
local_files_per_upload_request: int = 10,
123+
local_file_upload_concurrency: int = 30,
119124
) -> Union[dict, AsyncJob]:
120125
"""
121126
Uploads model outputs as predictions for a model_run. Returns info about the upload.
122-
:param annotations: List[Union[BoxPrediction, PolygonPrediction, CuboidPrediction, SegmentationPrediction]],
127+
128+
Args:
129+
annotations: Predictions to upload for this model run,
130+
update: If True, existing predictions for the same (reference_id, annotation_id)
131+
will be overwritten. If False, existing predictions will be skipped.
132+
asynchronous: Whether or not to process the upload asynchronously (and
133+
return an :class:`AsyncJob` object). Default is False.
134+
batch_size: Number of predictions processed in each concurrent batch.
135+
Default is 5000. If you get timeouts when uploading geometric annotations,
136+
you can try lowering this batch size. This is only relevant for
137+
asynchronous=False.
138+
remote_files_per_upload_request: Number of remote files to upload in each
139+
request. Segmentations have either local or remote files, if you are
140+
getting timeouts while uploading segmentations with remote urls, you
141+
should lower this value from its default of 20. This is only relevant for
142+
asynchronous=False
143+
local_files_per_upload_request: Number of local files to upload in each
144+
request. Segmentations have either local or remote files, if you are
145+
getting timeouts while uploading segmentations with local files, you
146+
should lower this value from its default of 10. The maximum is 10.
147+
This is only relevant for asynchronous=False
148+
local_file_upload_concurrency: Number of concurrent local file uploads.
149+
This is only relevant for asynchronous=False
123150
:return:
124151
{
125152
"model_run_id": str,
@@ -138,12 +165,17 @@ def predict(
138165
route=f"modelRun/{self.model_run_id}/predict?async=1",
139166
)
140167
return AsyncJob.from_json(response, self._client)
141-
else:
142-
return self._client.predict(
143-
model_run_id=self.model_run_id,
144-
annotations=annotations,
145-
update=update,
146-
)
168+
uploader = PredictionUploader(
169+
model_run_id=self.model_run_id, client=self._client
170+
)
171+
return uploader.upload(
172+
annotations=annotations,
173+
update=update,
174+
batch_size=batch_size,
175+
remote_files_per_upload_request=remote_files_per_upload_request,
176+
local_files_per_upload_request=local_files_per_upload_request,
177+
local_file_upload_concurrency=local_file_upload_concurrency,
178+
)
147179

148180
def iloc(self, i: int):
149181
"""

0 commit comments

Comments
 (0)