Commit 987ee87

Half mode for Depth Anything and ZoeDepth k and nk
Reduces VRAM usage and may provide speedups. Differences in the result should be imperceptible. Breaks exact reproducibility, but it was never a priority.
Parent commit: 204ea5b
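For context on the VRAM claim in the commit message, here is a minimal sketch in plain PyTorch (a toy layer, not one of the repo's depth models): converting parameters to float16 halves their memory footprint, which is where the savings come from.

import torch

# Toy layer standing in for a depth model; any module follows the same arithmetic.
model = torch.nn.Conv2d(3, 64, 3)

def param_bytes(m):
    return sum(p.numel() * p.element_size() for p in m.parameters())

print(param_bytes(model))          # float32 parameters
print(param_bytes(model.half()))   # float16 parameters - half the bytes

Activations and intermediate buffers shrink similarly during inference, which is why the savings are visible on the GPU and not just in the checkpoint size.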

5 files changed (+13, -5 lines)


dzoedepth/models/depth_model.py

Lines changed: 2 additions & 1 deletion
@@ -137,7 +137,8 @@ def infer_pil(self, pil_img, pad_input: bool=True, with_flip_aug: bool=True, out
             with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True.
             output_type (str, optional): output type. Supported values are 'numpy', 'pil' and 'tensor'. Defaults to "numpy".
         """
-        x = transforms.ToTensor()(pil_img).unsqueeze(0).to(self.device)
+        # dtype IS ADDED, NOT PRESENT IN THE MAINLINE
+        x = transforms.ToTensor()(pil_img).unsqueeze(0).to(device=self.device, dtype=next(self.parameters()).dtype)
         out_tensor = self.infer(x, pad_input=pad_input, with_flip_aug=with_flip_aug, **kwargs)
         if output_type == "numpy":
             return out_tensor.squeeze().cpu().numpy()
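The reason for the added dtype argument, sketched with plain PyTorch rather than repo code: transforms.ToTensor() always produces float32, so once the model has been converted with .half() the input must be cast to the parameters' dtype, or the first layer raises a float/half mismatch. The CUDA check below is an assumption, since half mode is normally used on the GPU.

import torch

# Toy stand-in for the pattern used in infer_pil.
model = torch.nn.Conv2d(3, 8, 3)
if torch.cuda.is_available():
    model = model.cuda().half()

x = torch.rand(1, 3, 64, 64)                      # float32, like transforms.ToTensor() output
param = next(model.parameters())
x = x.to(device=param.device, dtype=param.dtype)  # mirrors the added .to(...) call
out = model(x)                                    # runs in either full or half mode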

dzoedepth/models/layers/patch_transformer.py

Lines changed: 2 additions & 1 deletion
@@ -86,6 +86,7 @@ def forward(self, x):
         # change to S,N,E format required by transformer
         embeddings = embeddings.permute(2, 0, 1)
         S, N, E = embeddings.shape
-        embeddings = embeddings + self.positional_encoding_1d(S, N, E, device=embeddings.device)
+        # dtype IS ADDED, NOT PRESENT IN THE MAINLINE
+        embeddings = embeddings + self.positional_encoding_1d(S, N, E, device=embeddings.device).to(dtype=embeddings.dtype)
         x = self.transformer_encoder(embeddings) # .shape = S, N, E
         return x
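Why the extra .to(dtype=...) matters here, shown with toy tensors (not repo code): PyTorch's type promotion would otherwise turn float16 embeddings plus a float32 positional encoding into a float32 result, and the half-precision transformer_encoder downstream would then receive the wrong dtype.

import torch

emb = torch.zeros(4, 1, 8, dtype=torch.float16)      # embeddings in half mode
pe = torch.zeros(4, 1, 8)                             # positional encoding, float32 by default

print((emb + pe).dtype)                       # torch.float32 - silently promoted
print((emb + pe.to(dtype=emb.dtype)).dtype)   # torch.float16 - stays in half precision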

scripts/depthmap_api.py

Lines changed: 3 additions & 0 deletions
@@ -83,6 +83,7 @@ async def process_video(
         raise HTTPException(status_code=422, detail="No images supplied")
     print(f"Processing {str(len(depth_input_images))} images trough the API")

+    # You can use either these strings, or integers
     available_models = {
         'res101': 0,
         'dpt_beit_large_512': 1, #midas 3.1
@@ -94,6 +95,8 @@ async def process_video(
         'zoedepth_n': 7, #indoor
         'zoedepth_k': 8, #outdoor
         'zoedepth_nk': 9,
+        'marigold_v1': 10,
+        'depth_anything': 11
     }

     model_type = options["model_type"]
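The new comment says the API accepts either the string keys or the integer values. The diff does not show how options["model_type"] is resolved, so the following is only a hypothetical sketch of that lookup, using an abbreviated copy of the mapping above; resolve_model_type is not a function from the repo.

# Abbreviated copy of the mapping in depthmap_api.py.
available_models = {
    'res101': 0,
    'zoedepth_nk': 9,
    'marigold_v1': 10,
    'depth_anything': 11,
}

def resolve_model_type(value):
    # Accept a known string key or a known integer index; anything else is an error.
    if isinstance(value, str) and value in available_models:
        return available_models[value]
    if isinstance(value, int) and value in available_models.values():
        return value
    raise ValueError(f"Unknown model_type: {value!r}")

print(resolve_model_type('depth_anything'))  # 11
print(resolve_model_type(11))                # 11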

src/common_constants.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ def __init__(self, default_value=None, *args):
     STEREO_DIVERGENCE = 2.5
     STEREO_SEPARATION = 0.0
     STEREO_FILL_ALGO = "polylines_sharp"
-    STEREO_OFFSET_EXPONENT = 2.0
+    STEREO_OFFSET_EXPONENT = 1.0
     STEREO_BALANCE = 0.0

     GEN_NORMALMAP = False

src/depthmap_generation.py

Lines changed: 5 additions & 2 deletions
@@ -246,7 +246,9 @@ def flatten(el):
         if model_type in [0, 1, 2, 3, 4, 5, 6]:
             model = model.to(memory_format=torch.channels_last)  # TODO: weird
         if not self.no_half:
-            if model_type in [1, 2, 3, 4, 5, 6] and not boost:  # TODO: zoedepth, Marigold and depth_anything, too?
+            # Marigold can be done
+            # TODO: Fix for zoedepth_n - it completely trips and generates black images
+            if model_type in [1, 2, 3, 4, 5, 6, 8, 9, 11] and not boost:
                 model = model.half()
         model.to(device)  # to correct device

@@ -484,7 +486,8 @@ def estimatedepthanything(image, model, w, h):
     )

     timage = transform({"image": image})["image"]
-    timage = torch.from_numpy(timage).unsqueeze(0).to(next(model.parameters()).device)
+    timage = torch.from_numpy(timage).unsqueeze(0).to(device=next(model.parameters()).device,
+                                                      dtype=next(model.parameters()).dtype)

     with torch.no_grad():
         depth = model(timage)
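Restating the gating in the first hunk as a standalone snippet (names taken from the diff; the surrounding class and model loaders are not shown here): half conversion is skipped when the user disabled it, when boost is active, and for model types outside the allow-list, notably zoedepth_n (7), which the TODO says produces black images, and marigold_v1 (10), which the comment marks as possible but not yet enabled.

# Model types this commit converts with .half(); indices match depthmap_api.py.
HALF_CAPABLE = {1, 2, 3, 4, 5, 6, 8, 9, 11}

def maybe_half(model, model_type, no_half, boost):
    # Mirrors the condition in depthmap_generation.py: only convert when allowed.
    if not no_half and not boost and model_type in HALF_CAPABLE:
        model = model.half()
    return model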
