Commit a6dada3

shapovalov authored and facebook-github-bot committed

Extracted ImplicitronModelBase and unified API for GenericModel and ModelDBIR

Summary: To avoid model_zoo, we need to make GenericModel pluggable. I also align the creation APIs for convenience.

Reviewed By: bottler, davnov134
Differential Revision: D35933093
fbshipit-source-id: 8228926528eb41a795fbfbe32304b8019197e2b1

1 parent 5c59841
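
To sketch what "pluggable" means here: ImplicitronModelBase (added below) is a ReplaceableBase, so alternative models can be registered with implicitron's config system and swapped in for GenericModel. A minimal sketch, assuming the `registry` decorator from pytorch3d.implicitron.tools.config (the same module that provides ReplaceableBase); ConstantModel is a hypothetical illustration, not part of this commit:

    # A minimal sketch, assuming registry.register from
    # pytorch3d.implicitron.tools.config; ConstantModel is hypothetical.
    from typing import Any, Dict, Optional

    import torch
    from pytorch3d.implicitron.models.base_model import (
        ImplicitronModelBase,
        ImplicitronRender,
    )
    from pytorch3d.implicitron.tools.config import registry


    @registry.register
    class ConstantModel(ImplicitronModelBase):
        def forward(
            self, *, image_rgb: Optional[torch.Tensor], **kwargs
        ) -> Dict[str, Any]:
            # Render a constant gray image the size of the first (target) view.
            assert image_rgb is not None
            target = image_rgb[:1]
            render = ImplicitronRender(
                image_render=torch.full_like(target, 0.5),
                depth_render=torch.ones_like(target[:, :1]),
                mask_render=torch.ones_like(target[:, :1]),
            )
            # All models must return their render under this key.
            return {"implicitron_render": render}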

11 files changed: +280 -176 lines

projects/implicitron_trainer/experiment.py

Lines changed: 3 additions & 3 deletions

@@ -71,7 +71,7 @@
     ImplicitronDataset,
 )
 from pytorch3d.implicitron.evaluation import evaluate_new_view_synthesis as evaluate
-from pytorch3d.implicitron.models.base import EvaluationMode, GenericModel
+from pytorch3d.implicitron.models.generic_model import EvaluationMode, GenericModel
 from pytorch3d.implicitron.tools import model_io, vis_utils
 from pytorch3d.implicitron.tools.config import (
     enable_get_default_args,
@@ -615,11 +615,11 @@ def run_eval(cfg, model, all_source_cameras, loader, task, device):
         preds = model(
             **{**frame_data_for_eval, "evaluation_mode": EvaluationMode.EVALUATION}
         )
-        nvs_prediction = copy.deepcopy(preds["nvs_prediction"])
+        implicitron_render = copy.deepcopy(preds["implicitron_render"])
         per_batch_eval_results.append(
             evaluate.eval_batch(
                 frame_data,
-                nvs_prediction,
+                implicitron_render,
                 bg_color="black",
                 lpips_model=lpips_model,
                 source_cameras=all_source_cameras,
projects/implicitron_trainer/visualize_reconstruction.py

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@
     ImplicitronDataset,
 )
 from pytorch3d.implicitron.dataset.utils import is_train_frame
-from pytorch3d.implicitron.models.base import EvaluationMode
+from pytorch3d.implicitron.models.base_model import EvaluationMode
 from pytorch3d.implicitron.tools.configurable import get_default_args
 from pytorch3d.implicitron.tools.eval_video_trajectory import (
     generate_eval_video_cameras,

pytorch3d/implicitron/eval_demo.py

Lines changed: 6 additions & 7 deletions

@@ -5,10 +5,9 @@
 # LICENSE file in the root directory of this source tree.


-import copy
 import dataclasses
 import os
-from typing import cast, Optional
+from typing import cast, Optional, Tuple

 import lpips
 import torch
@@ -76,7 +75,7 @@ def main() -> None:

 def evaluate_dbir_for_category(
     category: str = "apple",
-    bg_color: float = 0.0,
+    bg_color: Tuple[float, float, float] = (0.0, 0.0, 0.0),
     task: str = "singlesequence",
     single_sequence_id: Optional[int] = None,
     num_workers: int = 16,
@@ -141,8 +140,9 @@ def evaluate_dbir_for_category(
         raise ValueError("Image size should be set in the dataset")

     # init the simple DBIR model
-    model = ModelDBIR(
-        image_size=image_size,
+    model = ModelDBIR(  # pyre-ignore[28]: c’tor implicitly overridden
+        render_image_width=image_size,
+        render_image_height=image_size,
         bg_color=bg_color,
         max_points=int(1e5),
     )
@@ -157,11 +157,10 @@ def evaluate_dbir_for_category(
     for frame_data in tqdm(test_dataloader):
         frame_data = dataclass_to_cuda_(frame_data)
         preds = model(**dataclasses.asdict(frame_data))
-        nvs_prediction = copy.deepcopy(preds["nvs_prediction"])
         per_batch_eval_results.append(
             eval_batch(
                 frame_data,
-                nvs_prediction,
+                preds["implicitron_render"],
                 bg_color=bg_color,
                 lpips_model=lpips_model,
                 source_cameras=all_source_cameras,
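
For reference, a usage sketch of the aligned ModelDBIR creation API shown above. The module path of ModelDBIR and the concrete sizes are assumptions for illustration; the parameter names are taken from the diff:

    # A minimal sketch of the new ModelDBIR creation API; the module path and
    # sizes are assumed for illustration.
    import dataclasses

    from pytorch3d.implicitron.models.model_dbir import ModelDBIR

    model = ModelDBIR(
        render_image_width=800,
        render_image_height=800,
        bg_color=(0.0, 0.0, 0.0),  # now an RGB tuple rather than a single float
        max_points=int(1e5),
    )
    # frame_data is assumed to come from an evaluation dataloader, as above.
    preds = model(**dataclasses.asdict(frame_data))
    render = preds["implicitron_render"]  # an ImplicitronRender instance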

pytorch3d/implicitron/evaluation/evaluate_new_view_synthesis.py

Lines changed: 32 additions & 57 deletions

@@ -9,12 +9,14 @@
 import warnings
 from collections import OrderedDict
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Sequence, Union

 import numpy as np
 import torch
+import torch.nn.functional as F
 from pytorch3d.implicitron.dataset.implicitron_dataset import FrameData
 from pytorch3d.implicitron.dataset.utils import is_known_frame, is_train_frame
+from pytorch3d.implicitron.models.base_model import ImplicitronRender
 from pytorch3d.implicitron.tools import vis_utils
 from pytorch3d.implicitron.tools.camera_utils import volumetric_camera_overlaps
 from pytorch3d.implicitron.tools.image_utils import mask_background
@@ -31,18 +33,6 @@
 EVAL_N_SRC_VIEWS = [1, 3, 5, 7, 9]


-@dataclass
-class NewViewSynthesisPrediction:
-    """
-    Holds the tensors that describe a result of synthesizing new views.
-    """
-
-    depth_render: Optional[torch.Tensor] = None
-    image_render: Optional[torch.Tensor] = None
-    mask_render: Optional[torch.Tensor] = None
-    camera_distance: Optional[torch.Tensor] = None
-
-
 @dataclass
 class _Visualizer:
     image_render: torch.Tensor
@@ -145,8 +135,8 @@ def show_depth(

 def eval_batch(
     frame_data: FrameData,
-    nvs_prediction: NewViewSynthesisPrediction,
-    bg_color: Union[torch.Tensor, str, float] = "black",
+    implicitron_render: ImplicitronRender,
+    bg_color: Union[torch.Tensor, Sequence, str, float] = "black",
     mask_thr: float = 0.5,
     lpips_model=None,
     visualize: bool = False,
@@ -162,14 +152,14 @@ def eval_batch(
     is True), a new-view synthesis method (NVS) is tasked to generate new views
     of the scene from the viewpoint of the target views (for which
     frame_data.frame_type.endswith('known') is False). The resulting
-    synthesized new views, stored in `nvs_prediction`, are compared to the
+    synthesized new views, stored in `implicitron_render`, are compared to the
     target ground truth in `frame_data` in terms of geometry and appearance,
     resulting in a dictionary of metrics returned by the `eval_batch` function.

     Args:
         frame_data: A FrameData object containing the input to the new view
             synthesis method.
-        nvs_prediction: The data describing the synthesized new views.
+        implicitron_render: The data describing the synthesized new views.
         bg_color: The background color of the generated new views and the
             ground truth.
         lpips_model: A pre-trained model for evaluating the LPIPS metric.
@@ -184,26 +174,39 @@ def eval_batch(
         ValueError if frame_data does not have frame_type, camera, or image_rgb
         ValueError if the batch has a mix of training and test samples
         ValueError if the batch frames are not [unseen, known, known, ...]
-        ValueError if one of the required fields in nvs_prediction is missing
+        ValueError if one of the required fields in implicitron_render is missing
     """
-    REQUIRED_NVS_PREDICTION_FIELDS = ["mask_render", "image_render", "depth_render"]
     frame_type = frame_data.frame_type
     if frame_type is None:
         raise ValueError("Frame type has not been set.")

     # we check that all those fields are not None but Pyre can't infer that properly
-    # TODO: assign to local variables
+    # TODO: assign to local variables and simplify the code.
     if frame_data.image_rgb is None:
         raise ValueError("Image is not in the evaluation batch.")

     if frame_data.camera is None:
         raise ValueError("Camera is not in the evaluation batch.")

-    if any(not hasattr(nvs_prediction, k) for k in REQUIRED_NVS_PREDICTION_FIELDS):
-        raise ValueError("One of the required predicted fields is missing")
+    # eval all results in the resolution of the frame_data image
+    image_resol = tuple(frame_data.image_rgb.shape[2:])
+
+    # Post-process the render:
+    # 1) check implicitron_render for Nones,
+    # 2) obtain copies to make sure we don't edit the original data,
+    # 3) take only the 1st (target) image,
+    # 4) resize to match the ground-truth resolution.
+    cloned_render: Dict[str, torch.Tensor] = {}
+    for k in ["mask_render", "image_render", "depth_render"]:
+        field = getattr(implicitron_render, k)
+        if field is None:
+            raise ValueError(f"A required predicted field {k} is missing")
+
+        imode = "bilinear" if k == "image_render" else "nearest"
+        cloned_render[k] = (
+            F.interpolate(field[:1], size=image_resol, mode=imode).detach().clone()
+        )

-    # obtain copies to make sure we dont edit the original data
-    nvs_prediction = copy.deepcopy(nvs_prediction)
     frame_data = copy.deepcopy(frame_data)

     # mask the ground truth depth in case frame_data contains the depth mask
@@ -226,9 +229,6 @@ def eval_batch(
             + " a target view while the rest should be source views."
         )  # TODO: do we need to enforce this?

-    # take only the first (target image)
-    for k in REQUIRED_NVS_PREDICTION_FIELDS:
-        setattr(nvs_prediction, k, getattr(nvs_prediction, k)[:1])
     for k in [
         "depth_map",
         "image_rgb",
@@ -242,10 +242,6 @@ def eval_batch(
     if frame_data.depth_map is None or frame_data.depth_map.sum() <= 0:
         warnings.warn("Empty or missing depth map in evaluation!")

-    # eval all results in the resolution of the frame_data image
-    # pyre-fixme[16]: `Optional` has no attribute `shape`.
-    image_resol = list(frame_data.image_rgb.shape[2:])
-
     # threshold the masks to make ground truth binary masks
     mask_fg, mask_crop = [
         (getattr(frame_data, k) >= mask_thr) for k in ("fg_probability", "mask_crop")
@@ -258,29 +254,14 @@ def eval_batch(
         bg_color=bg_color,
     )

-    # resize to the target resolution
-    for k in REQUIRED_NVS_PREDICTION_FIELDS:
-        imode = "bilinear" if k == "image_render" else "nearest"
-        val = getattr(nvs_prediction, k)
-        setattr(
-            nvs_prediction,
-            k,
-            # pyre-fixme[6]: Expected `Optional[int]` for 2nd param but got
-            # `List[typing.Any]`.
-            torch.nn.functional.interpolate(val, size=image_resol, mode=imode),
-        )
-
     # clamp predicted images
-    # pyre-fixme[16]: `Optional` has no attribute `clamp`.
-    image_render = nvs_prediction.image_render.clamp(0.0, 1.0)
+    image_render = cloned_render["image_render"].clamp(0.0, 1.0)

     if visualize:
         visualizer = _Visualizer(
             image_render=image_render,
             image_rgb_masked=image_rgb_masked,
-            # pyre-fixme[6]: Expected `Tensor` for 3rd param but got
-            # `Optional[torch.Tensor]`.
-            depth_render=nvs_prediction.depth_render,
+            depth_render=cloned_render["depth_render"],
             # pyre-fixme[6]: Expected `Tensor` for 4th param but got
             # `Optional[torch.Tensor]`.
             depth_map=frame_data.depth_map,
@@ -292,9 +273,7 @@ def eval_batch(
     results: Dict[str, Any] = {}

     results["iou"] = iou(
-        # pyre-fixme[6]: Expected `Tensor` for 1st param but got
-        # `Optional[torch.Tensor]`.
-        nvs_prediction.mask_render,
+        cloned_render["mask_render"],
         mask_fg,
         mask=mask_crop,
     )
@@ -321,11 +300,7 @@ def eval_batch(
         if name_postfix == "_fg":
             # only record depth metrics for the foreground
             _, abs_ = eval_depth(
-                # pyre-fixme[6]: Expected `Tensor` for 1st param but got
-                # `Optional[torch.Tensor]`.
-                nvs_prediction.depth_render,
-                # pyre-fixme[6]: Expected `Tensor` for 2nd param but got
-                # `Optional[torch.Tensor]`.
+                cloned_render["depth_render"],
                 frame_data.depth_map,
                 get_best_scale=True,
                 mask=loss_mask_now,
@@ -343,7 +318,7 @@ def eval_batch(
     if lpips_model is not None:
         im1, im2 = [
             2.0 * im.clamp(0.0, 1.0) - 1.0
-            for im in (image_rgb_masked, nvs_prediction.image_render)
+            for im in (image_rgb_masked, cloned_render["image_render"])
         ]
         results["lpips"] = lpips_model.forward(im1, im2).item()
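To make the new calling convention concrete, a minimal sketch of driving eval_batch with an ImplicitronRender; the tensor shapes are arbitrary example values, and frame_data is assumed to come from an evaluation dataloader as in eval_demo.py above:

    # A minimal sketch of the new eval_batch contract; shapes are arbitrary
    # and frame_data is assumed to come from an evaluation dataloader.
    import torch
    from pytorch3d.implicitron.evaluation.evaluate_new_view_synthesis import (
        eval_batch,
    )
    from pytorch3d.implicitron.models.base_model import ImplicitronRender

    render = ImplicitronRender(
        image_render=torch.rand(1, 3, 128, 128),
        depth_render=torch.rand(1, 1, 128, 128),
        mask_render=torch.ones(1, 1, 128, 128),
    )
    # eval_batch now resizes the render to frame_data.image_rgb's resolution
    # internally, so the render resolution need not match the ground truth.
    results = eval_batch(frame_data, render, bg_color="black")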

pytorch3d/implicitron/models/base_model.py

Lines changed: 87 additions & 0 deletions

@@ -0,0 +1,87 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+import torch
+from pytorch3d.implicitron.tools.config import ReplaceableBase
+from pytorch3d.renderer.cameras import CamerasBase
+
+from .renderer.base import EvaluationMode
+
+
+@dataclass
+class ImplicitronRender:
+    """
+    Holds the tensors that describe a result of rendering.
+    """
+
+    depth_render: Optional[torch.Tensor] = None
+    image_render: Optional[torch.Tensor] = None
+    mask_render: Optional[torch.Tensor] = None
+    camera_distance: Optional[torch.Tensor] = None
+
+    def clone(self) -> "ImplicitronRender":
+        def safe_clone(t: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+            return t.detach().clone() if t is not None else None
+
+        return ImplicitronRender(
+            depth_render=safe_clone(self.depth_render),
+            image_render=safe_clone(self.image_render),
+            mask_render=safe_clone(self.mask_render),
+            camera_distance=safe_clone(self.camera_distance),
+        )
+
+
+class ImplicitronModelBase(ReplaceableBase):
+    """Replaceable abstract base for all image generation / rendering models.
+
+    The `forward()` method produces a render with a depth map.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(
+        self,
+        *,  # force keyword-only arguments
+        image_rgb: Optional[torch.Tensor],
+        camera: CamerasBase,
+        fg_probability: Optional[torch.Tensor],
+        mask_crop: Optional[torch.Tensor],
+        depth_map: Optional[torch.Tensor],
+        sequence_name: Optional[List[str]],
+        evaluation_mode: EvaluationMode = EvaluationMode.EVALUATION,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """
+        Args:
+            image_rgb: A tensor of shape `(B, 3, H, W)` containing a batch of
+                rgb images; the first `min(B, n_train_target_views)` images are
+                considered targets and are used to supervise the renders; the
+                rest correspond to the source viewpoints from which features
+                will be extracted.
+            camera: An instance of CamerasBase containing a batch of `B`
+                cameras corresponding to the viewpoints of the target images,
+                from which the rays will be sampled, and the source images,
+                which will be used for intersecting with the target rays.
+            fg_probability: A tensor of shape `(B, 1, H, W)` containing a batch
+                of foreground masks.
+            mask_crop: A binary tensor of shape `(B, 1, H, W)` denoting valid
+                regions in the input images (i.e. regions that do not
+                correspond to, e.g., zero-padding). When the `RaySampler`'s
+                sampling mode is set to "mask_sample", rays will be sampled in
+                the non-zero regions.
+            depth_map: A tensor of shape `(B, 1, H, W)` containing a batch of
+                depth maps.
+            sequence_name: A list of `B` strings corresponding to the sequence
+                names from which the images `image_rgb` were extracted. They
+                are used to match target frames with relevant source frames.
+            evaluation_mode: one of EvaluationMode.TRAINING or
+                EvaluationMode.EVALUATION which determines the settings used
+                for rendering.
+
+        Returns:
+            preds: A dictionary containing all outputs of the forward pass.
+                All models should output an instance of `ImplicitronRender`
+                in `preds["implicitron_render"]`.
+        """
+        raise NotImplementedError()