diff --git a/README.md b/README.md index ccb66cc..1ed85b9 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,8 @@ video by [@graemeniedermayer](https://github.com/graemeniedermayer), more exampl images generated by [@semjon00](https://github.com/semjon00) from CC0 photos, more examples [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/56#issuecomment-1367596463). ## Changelog +* v0.4.1 standalone mode + * ability to run DepthMap without WebUI (Use main.py. Make sure all the dependencies are installed. The support is not feature-complete.) * v0.4.0 large code refactor * UI improvements * improved Batch from Directory, Clip and renormalize DepthMap diff --git a/main.py b/main.py new file mode 100644 index 0000000..fc7cf68 --- /dev/null +++ b/main.py @@ -0,0 +1,39 @@ +# This launches DepthMap without the AUTOMATIC1111/stable-diffusion-webui +# If DepthMap is installed as an extension, +# you may want to change the working directory to the stable-diffusion-webui root. + +import argparse +import os +import pathlib +import builtins + +import src.misc + +def maybe_chdir(): + """Detects if DepthMap was installed as a stable-diffusion-webui script, but run without current directory set to + the stable-diffusion-webui root. 
Changes current directory if needed, to avoid clutter.""" + try: + file_path = pathlib.Path(__file__) + path = file_path.parts + while len(path) > 0 and path[-1] != src.misc.REPOSITORY_NAME: + path = path[:-1] + if len(path) >= 2 and path[-1] == src.misc.REPOSITORY_NAME and path[-2] == "extensions": + path = path[:-2] + listdir = os.listdir(str(pathlib.Path(*path))) + if 'launch.py' in listdir and 'webui.py' in listdir: + os.chdir(str(pathlib.Path(*path))) + except: + pass + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--listen", action="store_true", help="Create public link") + parser.add_argument("--no_chdir", action="store_true", help="Do not try to use the root of stable-diffusion-webui") + args = parser.parse_args() + + print(f"{src.misc.SCRIPT_FULL_NAME} running in standalone mode!") + import src.common_ui + if not args.no_chdir: + maybe_chdir() + src.common_ui.on_ui_tabs().launch(share=args.listen) diff --git a/scripts/depthmap.py b/scripts/depthmap.py index a0e85a1..868add5 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap.py @@ -1,233 +1,13 @@ -import gradio as gr import traceback +import gradio as gr +from modules import shared import modules.scripts as scripts -from modules import processing, images, shared -from modules import script_callbacks -from modules.call_queue import wrap_gradio_gpu_call -from modules.processing import create_infotext -from modules.shared import opts -from modules.ui import plaintext_to_html -from pathlib import Path -from PIL import Image +from src import backbone +from src import common_ui +from src.core import core_generation_funnel +from src.gradio_args_transport import GradioComponentBundle -from src.main import * -from src.core import core_generation_funnel, unload_models, run_makevideo -from src.depthmap_generation import ModelHolder - - -# Ugly workaround to fix gradio tempfile issue -def ensure_gradio_temp_directory(): - try: - import tempfile - path = os.path.join(tempfile.gettempdir(), 'gradio') - if not
(os.path.exists(path)): - os.mkdir(path) - except Exception as e: - traceback.print_exc() -ensure_gradio_temp_directory() - - -def main_ui_panel(is_depth_tab): - inp = GradioComponentBundle() - # TODO: Greater visual separation - with gr.Blocks(): - with gr.Row(): - inp += 'compute_device', gr.Radio(label="Compute on", choices=['GPU', 'CPU'], value='GPU') - # TODO: Should return value instead of index. Maybe Enum should be used? - inp += 'model_type', gr.Dropdown(label="Model", - choices=['res101', 'dpt_beit_large_512 (midas 3.1)', - 'dpt_beit_large_384 (midas 3.1)', 'dpt_large_384 (midas 3.0)', - 'dpt_hybrid_384 (midas 3.0)', - 'midas_v21', 'midas_v21_small', - 'zoedepth_n (indoor)', 'zoedepth_k (outdoor)', 'zoedepth_nk'], - value='res101', - type="index") - with gr.Group(): - with gr.Row(): - inp += 'boost', gr.Checkbox(label="BOOST (multi-resolution merging)", value=True) - with gr.Group(visible=False) as options_depend_on_boost: - inp += 'match_size', gr.Checkbox(label="Match net size to input size", value=False) - with gr.Row(visible=False) as options_depend_on_match_size: - inp += 'net_width', gr.Slider(minimum=64, maximum=2048, step=64, label='Net width', value=448) - inp += 'net_height', gr.Slider(minimum=64, maximum=2048, step=64, label='Net height', value=448) - - with gr.Group(): - with gr.Row(): - inp += "save_outputs", gr.Checkbox(label="Save Outputs", value=True) # 50% of width - with gr.Group(): # 50% of width - inp += "output_depth", gr.Checkbox(label="Output DepthMap", value=True) - inp += "invert_depth", gr.Checkbox(label="Invert (black=near, white=far)", value=False) - with gr.Row() as options_depend_on_output_depth_1: - inp += "combine_output", gr.Checkbox( - label="Combine input and depthmap into one image", value=False) - inp += "combine_output_axis", gr.Radio(label="Combine axis", choices=['Vertical', 'Horizontal'], - value='Horizontal', type="index", visible=False) - with gr.Group(): - with gr.Row(): - inp += 'clipdepth', 
gr.Checkbox(label="Clip and renormalize DepthMap", value=False) - with gr.Row(visible=False) as clip_options_row_1: - inp += "clipthreshold_far", gr.Slider(minimum=0, maximum=1, step=0.001, label='Far clip', value=0) - inp += "clipthreshold_near", gr.Slider(minimum=0, maximum=1, step=0.001, label='Near clip', value=1) - - with gr.Group(): - with gr.Row(): - inp += "show_heat", gr.Checkbox(label="Generate HeatMap", value=False) - # gr.Checkbox(label="Generate NormalMap", value=False) # TODO: this is a fake door - - with gr.Group(): - with gr.Row(): - inp += "gen_stereo", gr.Checkbox(label="Generate stereoscopic image(s)", value=False) - with gr.Group(visible=False) as stereo_options: - with gr.Row(): - with gr.Row(): - inp += "stereo_modes", gr.CheckboxGroup( - ["left-right", "right-left", "top-bottom", "bottom-top", "red-cyan-anaglyph"], - label="Output", value=["left-right", "red-cyan-anaglyph"]) - with gr.Row(): - inp += "stereo_divergence", gr.Slider(minimum=0.05, maximum=10.005, step=0.01, - label='Divergence (3D effect)', - value=2.5) - inp += "stereo_separation", gr.Slider(minimum=-5.0, maximum=5.0, step=0.01, - label='Separation (moves images apart)', - value=0.0) - with gr.Row(): - inp += "stereo_fill", gr.Dropdown(label="Gap fill technique", - choices=['none', 'naive', 'naive_interpolating', 'polylines_soft', - 'polylines_sharp'], value='polylines_sharp', - type="value") - inp += "stereo_balance", gr.Slider(minimum=-1.0, maximum=1.0, step=0.05, - label='Balance between eyes', - value=0.0) - - with gr.Group(): - with gr.Row(): - inp += "gen_mesh", gr.Checkbox( - label="Generate simple 3D mesh", value=False, visible=True) - with gr.Group(visible=False) as mesh_options: - with gr.Row(): - gr.HTML(value="Generates fast, accurate only with ZoeDepth models and no boost, no custom maps") - with gr.Row(): - inp += "mesh_occlude", gr.Checkbox(label="Remove occluded edges", value=True, visible=True) - inp += "mesh_spherical", gr.Checkbox(label="Equirectangular 
projection", value=False, visible=True) - - if is_depth_tab: - with gr.Group(): - with gr.Row(): - inp += "inpaint", gr.Checkbox( - label="Generate 3D inpainted mesh", value=False) - with gr.Group(visible=False) as inpaint_options_row_0: - gr.HTML("Generation is sloooow, required for generating videos") - inp += "inpaint_vids", gr.Checkbox( - label="Generate 4 demo videos with 3D inpainted mesh.", value=False) - gr.HTML("More options for generating video can be found in the Generate video tab") - - with gr.Group(): - # TODO: it should be clear from the UI that the background removal does not use the model selected above - with gr.Row(): - inp += "background_removal", gr.Checkbox(label="Remove background", value=False) - with gr.Row(visible=False) as bgrem_options_row_1: - inp += "save_background_removal_masks", gr.Checkbox(label="Save the foreground masks", value=False) - inp += "pre_depth_background_removal", gr.Checkbox(label="Pre-depth background removal", value=False) - with gr.Row(visible=False) as bgrem_options_row_2: - inp += "background_removal_model", gr.Dropdown(label="Rembg Model", - choices=['u2net', 'u2netp', 'u2net_human_seg', - 'silueta'], - value='u2net', type="value") - - with gr.Box(): - gr.HTML(f"{SCRIPT_FULL_NAME}
") - gr.HTML("Information, comment and share @ " - "https://github.com/thygate/stable-diffusion-webui-depthmap-script") - - inp += "gen_normal", gr.Checkbox(label="Generate Normalmap (hidden! api only)", value=False, visible=False) - - def update_delault_net_size(model_type): - w, h = ModelHolder.get_default_net_size(model_type) - return inp['net_width'].update(value=w), inp['net_height'].update(value=h) - inp['model_type'].change( - fn=update_delault_net_size, - inputs=inp['model_type'], - outputs=[inp['net_width'], inp['net_height']] - ) - - inp['boost'].change( - fn=lambda a, b: (options_depend_on_boost.update(visible=not a), - options_depend_on_match_size.update(visible=not a and not b)), - inputs=[inp['boost'], inp['match_size']], - outputs=[options_depend_on_boost, options_depend_on_match_size] - ) - inp['match_size'].change( - fn=lambda a, b: options_depend_on_match_size.update(visible=not a and not b), - inputs=[inp['boost'], inp['match_size']], - outputs=[options_depend_on_match_size] - ) - - inp['output_depth'].change( - fn=lambda a: (inp['invert_depth'].update(visible=a), options_depend_on_output_depth_1.update(visible=a)), - inputs=[inp['output_depth']], - outputs=[inp['invert_depth'], options_depend_on_output_depth_1] - ) - - inp['combine_output'].change( - fn=lambda v: inp['combine_output_axis'].update(visible=v), - inputs=[inp['combine_output']], - outputs=[inp['combine_output_axis']] - ) - - inp['clipdepth'].change( - fn=lambda v: clip_options_row_1.update(visible=v), - inputs=[inp['clipdepth']], - outputs=[clip_options_row_1] - ) - inp['clipthreshold_far'].change( - fn=lambda a, b: a if b < a else b, - inputs=[inp['clipthreshold_far'], inp['clipthreshold_near']], - outputs=[inp['clipthreshold_near']] - ) - inp['clipthreshold_near'].change( - fn=lambda a, b: a if b > a else b, - inputs=[inp['clipthreshold_near'], inp['clipthreshold_far']], - outputs=[inp['clipthreshold_far']] - ) - - def stereo_options_visibility(v): - return 
stereo_options.update(visible=v) - - inp['gen_stereo'].change( - fn=stereo_options_visibility, - inputs=[inp['gen_stereo']], - outputs=[stereo_options] - ) - - inp['gen_mesh'].change( - fn=lambda v: mesh_options.update(visible=v), - inputs=[inp['gen_mesh']], - outputs=[mesh_options] - ) - - def inpaint_options_visibility(v): - return inpaint_options_row_0.update(visible=v) - - if is_depth_tab: - inp['inpaint'].change( - fn=inpaint_options_visibility, - inputs=[inp['inpaint']], - outputs=[inpaint_options_row_0] - ) - - def background_removal_options_visibility(v): - return bgrem_options_row_1.update(visible=v), \ - bgrem_options_row_2.update(visible=v) - - inp['background_removal'].change( - fn=background_removal_options_visibility, - inputs=[inp['background_removal']], - outputs=[bgrem_options_row_1, bgrem_options_row_2] - ) - - return inp +from src.misc import * class Script(scripts.Script): @@ -241,12 +21,15 @@ def ui(self, is_img2img): gr.HTML() # Work around a Gradio bug with gr.Column(variant='panel'): gr.HTML() # Work around a Gradio bug - ret = main_ui_panel(False) + ret = common_ui.main_ui_panel(False) ret += ret.enkey_tail() return ret.enkey_body() # run from script in txt2img or img2img def run(self, p, *inputs): + from modules import processing + from modules.processing import create_infotext + inputs = GradioComponentBundle.enkey_to_dict(inputs) # sd process @@ -256,15 +39,15 @@ def run(self, p, *inputs): inputimages = [] for count in range(0, len(processed.images)): # skip first grid image - if count == 0 and len(processed.images) > 1 and opts.return_grid: + if count == 0 and len(processed.images) > 1 and shared.opts.return_grid: continue inputimages.append(processed.images[count]) - outputs, mesh_fi, meshsimple_fi = core_generation_funnel(p.outpath_samples, inputimages, None, None, inputs) + outputs, mesh_fi, meshsimple_fi = core_generation_funnel(p.outpath_samples, inputimages, None, None, inputs, backbone.gather_ops()) for input_i, imgs in 
enumerate(outputs): # get generation parameters - if hasattr(processed, 'all_prompts') and opts.enable_pnginfo: + if hasattr(processed, 'all_prompts') and shared.opts.enable_pnginfo: info = create_infotext(processed, processed.all_prompts, processed.all_seeds, processed.all_subseeds, "", 0, input_i) else: @@ -273,12 +56,12 @@ def run(self, p, *inputs): processed.images.append(image) if inputs["save_outputs"]: try: - suffix = "" if image_type == "depth" else f"_{image_type}" - images.save_image(image, path=p.outpath_samples, basename="", seed=processed.all_seeds[input_i], - prompt=processed.all_prompts[input_i], extension=opts.samples_format, - info=info, - p=processed, - suffix=suffix) + suffix = "" if image_type == "depth" else f"_{image_type}" + backbone.save_image(image, path=p.outpath_samples, basename="", seed=processed.all_seeds[input_i], + prompt=processed.all_prompts[input_i], extension=shared.opts.samples_format, + info=info, + p=processed, + suffix=suffix) except Exception as e: if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): raise e @@ -310,261 +93,6 @@ def on_ui_settings(): section=section)) -def on_ui_tabs(): - inp = GradioComponentBundle() - with gr.Blocks(analytics_enabled=False) as depthmap_interface: - with gr.Row().style(equal_height=False): - with gr.Column(variant='panel'): - inp += 'depthmap_mode', gr.HTML(visible=False, value='0') - with gr.Tabs(): - with gr.TabItem('Single Image') as depthmap_mode_0: - with gr.Row(): - inp += gr.Image(label="Source", source="upload", interactive=True, type="pil", - elem_id="depthmap_input_image") - with gr.Group(visible=False) as custom_depthmap_row_0: - # TODO: depthmap generation settings should disappear when using this - inp += gr.File(label="Custom DepthMap", file_count="single", interactive=True, - type="file", elem_id='custom_depthmap_img') - inp += gr.Checkbox(elem_id="custom_depthmap", label="Use custom DepthMap", value=False) - with gr.TabItem('Batch Process') as depthmap_mode_1: - inp
+= gr.File(elem_id='image_batch', label="Batch Process", file_count="multiple", - interactive=True, type="file") - with gr.TabItem('Batch from Directory') as depthmap_mode_2: - inp += gr.Textbox(elem_id="depthmap_batch_input_dir", label="Input directory", - **shared.hide_dirs, - placeholder="A directory on the same machine where the server is running.") - inp += gr.Textbox(elem_id="depthmap_batch_output_dir", label="Output directory", - **shared.hide_dirs, - placeholder="Leave blank to save images to the default path.") - gr.HTML("Files in the output directory may be overwritten.") - inp += gr.Checkbox(elem_id="depthmap_batch_reuse", - label="Skip generation and use (edited/custom) depthmaps " - "in output directory when a file already exists.", - value=True) - submit = gr.Button('Generate', elem_id="depthmap_generate", variant='primary') - inp += main_ui_panel(True) # Main panel is inserted here - unloadmodels = gr.Button('Unload models', elem_id="depthmap_unloadmodels") - - with gr.Column(variant='panel'): - with gr.Tabs(elem_id="mode_depthmap_output"): - with gr.TabItem('Depth Output'): - with gr.Group(): - result_images = gr.Gallery(label='Output', show_label=False, - elem_id=f"depthmap_gallery").style(grid=4) - with gr.Column(): - html_info_x = gr.HTML() - html_info = gr.HTML() - - with gr.TabItem('3D Mesh'): - with gr.Group(): - result_depthmesh = gr.Model3D(label="3d Mesh", clear_color=[1.0, 1.0, 1.0, 1.0]) - with gr.Row(): - # loadmesh = gr.Button('Load') - clearmesh = gr.Button('Clear') - - with gr.TabItem('Generate video'): - # generate video - with gr.Group(): - with gr.Row(): - gr.Markdown("Generate video from inpainted(!) 
mesh.") - with gr.Row(): - depth_vid = gr.Video(interactive=False) - with gr.Column(): - vid_html_info_x = gr.HTML() - vid_html_info = gr.HTML() - fn_mesh = gr.Textbox(label="Input Mesh (.ply | .obj)", **shared.hide_dirs, - placeholder="A file on the same machine where " - "the server is running.") - with gr.Row(): - vid_numframes = gr.Textbox(label="Number of frames", value="300") - vid_fps = gr.Textbox(label="Framerate", value="40") - vid_format = gr.Dropdown(label="Format", choices=['mp4', 'webm'], value='mp4', - type="value", elem_id="video_format") - vid_ssaa = gr.Dropdown(label="SSAA", choices=['1', '2', '3', '4'], value='3', - type="value", elem_id="video_ssaa") - with gr.Row(): - vid_traj = gr.Dropdown(label="Trajectory", - choices=['straight-line', 'double-straight-line', 'circle'], - value='double-straight-line', type="index", - elem_id="video_trajectory") - vid_shift = gr.Textbox(label="Translate: x, y, z", value="-0.015, 0.0, -0.05") - vid_border = gr.Textbox(label="Crop: top, left, bottom, right", - value="0.03, 0.03, 0.05, 0.03") - vid_dolly = gr.Checkbox(label="Dolly", value=False, elem_classes="smalltxt") - with gr.Row(): - submit_vid = gr.Button('Generate Video', elem_id="depthmap_generatevideo", - variant='primary') - - inp += inp.enkey_tail() - - depthmap_mode_0.select(lambda: '0', None, inp['depthmap_mode']) - depthmap_mode_1.select(lambda: '1', None, inp['depthmap_mode']) - depthmap_mode_2.select(lambda: '2', None, inp['depthmap_mode']) - - def custom_depthmap_visibility(v): - return custom_depthmap_row_0.update(visible=v) - - inp['custom_depthmap'].change( - fn=custom_depthmap_visibility, - inputs=[inp['custom_depthmap']], - outputs=[custom_depthmap_row_0] - ) - - unloadmodels.click( - fn=unload_models, - inputs=[], - outputs=[] - ) - - clearmesh.click( - fn=lambda: None, - inputs=[], - outputs=[result_depthmesh] - ) - - submit.click( - fn=wrap_gradio_gpu_call(run_generate), - inputs=inp.enkey_body(), - outputs=[ - result_images, - fn_mesh, - 
result_depthmesh, - html_info_x, - html_info - ] - ) - - submit_vid.click( - fn=wrap_gradio_gpu_call(run_makevideo), - inputs=[ - fn_mesh, - vid_numframes, - vid_fps, - vid_traj, - vid_shift, - vid_border, - vid_dolly, - vid_format, - vid_ssaa - ], - outputs=[ - depth_vid, - vid_html_info_x, - vid_html_info - ] - ) - - return (depthmap_interface, "Depth", "depthmap_interface"), - - -# called from depth tab -def run_generate(*inputs): - inputs = GradioComponentBundle.enkey_to_dict(inputs) - depthmap_mode = inputs['depthmap_mode'] - depthmap_batch_input_dir = inputs['depthmap_batch_input_dir'] - image_batch = inputs['image_batch'] - depthmap_input_image = inputs['depthmap_input_image'] - depthmap_batch_output_dir = inputs['depthmap_batch_output_dir'] - depthmap_batch_reuse = inputs['depthmap_batch_reuse'] - custom_depthmap = inputs['custom_depthmap'] - custom_depthmap_img = inputs['custom_depthmap_img'] - - inputimages = [] - # Allow supplying custom depthmaps - inputdepthmaps = [] - # Also keep track of original file names - inputnames = [] - - if depthmap_mode == '2' and depthmap_batch_output_dir != '': - outpath = depthmap_batch_output_dir - else: - outpath = opts.outdir_samples or opts.outdir_extras_samples - - if depthmap_mode == '0': # Single image - if depthmap_input_image is None: - return [], None, None, "Please select an input image!", "" - inputimages.append(depthmap_input_image) - inputnames.append(None) - if custom_depthmap: - if custom_depthmap_img is None: - return [], None, None,\ - "Custom depthmap is not specified. 
Please either supply it or disable this option.", "" - inputdepthmaps.append(Image.open(os.path.abspath(custom_depthmap_img.name))) - else: - inputdepthmaps.append(None) - if depthmap_mode == '1': # Batch Process - if image_batch is None: - return [], None, None, "Please select input images!", "" - for img in image_batch: - image = Image.open(os.path.abspath(img.name)) - inputimages.append(image) - inputnames.append(os.path.splitext(img.orig_name)[0]) - elif depthmap_mode == '2': # Batch from Directory - assert not shared.cmd_opts.hide_ui_dir_config, '--hide-ui-dir-config option must be disabled' - if depthmap_batch_input_dir == '': - return [], None, None, "Please select an input directory.", "" - if depthmap_batch_input_dir == depthmap_batch_output_dir: - return [], None, None, "Please pick different directories for batch processing.", "" - image_list = shared.listfiles(depthmap_batch_input_dir) - for path in image_list: - try: - inputimages.append(Image.open(path)) - inputnames.append(path) - - custom_depthmap = None - if depthmap_batch_reuse: - basename = Path(path).stem - # Custom names are not used in samples directory - if outpath != opts.outdir_extras_samples: - # Possible filenames that the custom depthmaps may have - name_candidates = [f'{basename}-0000.{opts.samples_format}', # current format - f'{basename}.png', # human-intuitive format - f'{Path(path).name}'] # human-intuitive format (worse) - for fn_cand in name_candidates: - path_cand = os.path.join(outpath, fn_cand) - if os.path.isfile(path_cand): - custom_depthmap = Image.open(os.path.abspath(path_cand)) - break - inputdepthmaps.append(custom_depthmap) - except Exception as e: - print(f'Failed to load {path}, ignoring. 
Exception: {str(e)}') - inputdepthmaps_n = len([1 for x in inputdepthmaps if x is not None]) - print(f'{len(inputimages)} images will be processed, {inputdepthmaps_n} existing depthmaps will be reused') - - outputs, mesh_fi, meshsimple_fi = core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inputs) - show_images = [] - - # Saving images - for input_i, imgs in enumerate(outputs): - basename = 'depthmap' - if depthmap_mode == '2' and inputnames[input_i] is not None and outpath != opts.outdir_extras_samples: - basename = Path(inputnames[input_i]).stem - - for image_type, image in list(imgs.items()): - show_images += [image] - if inputs["save_outputs"]: - try: - suffix = "" if image_type == "depth" else f"_{image_type}" - images.save_image(image, path=outpath, basename=basename, seed=None, - prompt=None, extension=opts.samples_format, short_filename=True, - no_prompt=True, grid=False, pnginfo_section_name="extras", - suffix=suffix) - except Exception as e: - if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): - raise e - print('Catched exception: image has wrong mode!') - traceback.print_exc() - - # use inpainted 3d mesh to show in 3d model output when enabled in settings - if hasattr(opts, 'depthmap_script_show_3d_inpaint') and opts.depthmap_script_show_3d_inpaint \ - and mesh_fi is not None and len(mesh_fi) > 0: - meshsimple_fi = mesh_fi - # however, don't show 3dmodel when disabled in settings - if hasattr(opts, 'depthmap_script_show_3d') and not opts.depthmap_script_show_3d: - meshsimple_fi = None - # TODO: return more info - return show_images, mesh_fi, meshsimple_fi, plaintext_to_html('info'), '' - - +from modules import script_callbacks script_callbacks.on_ui_settings(on_ui_settings) -script_callbacks.on_ui_tabs(on_ui_tabs) +script_callbacks.on_ui_tabs(lambda: [(common_ui.on_ui_tabs(), "Depth", "depthmap_interface")]) diff --git a/src/backbone.py b/src/backbone.py new file mode 100644 index 0000000..0829ce0 --- /dev/null +++ 
b/src/backbone.py @@ -0,0 +1,118 @@ +# DepthMap can be run inside stable-duiffusion-webui, but also separately. +# All the stable-duiffusion-webui stuff that the DepthMap relies on +# must be resided in this file (or in the scripts folder). +import pathlib +from datetime import datetime + +try: + # stable-duiffusion-webui backbone + from modules.images import save_image # Should fail if not on stable-duiffusion-webui + from modules.devices import torch_gc # TODO: is this really sufficient? + from modules.images import get_next_sequence_number + from modules.call_queue import wrap_gradio_gpu_call + from modules.shared import listfiles + + def get_opt(name, default): + from modules.shared import opts + if hasattr(opts, name): + return opts.__getattr__(name) + return default + + def get_cmd_opt(name, default): + """Get command line argument""" + from modules.shared import cmd_opts + if hasattr(cmd_opts, name): + return cmd_opts.__getattribute__(name) + return default + + def gather_ops(): + """Parameters for depthmap generation""" + from modules.shared import cmd_opts + ops = {} + if get_opt('depthmap_script_boost_rmax', None) is not None: + ops['boost_whole_size_threshold'] = get_opt('depthmap_script_boost_rmax', None) + ops['precision'] = cmd_opts.precision + ops['no_half'] = cmd_opts.no_half + return ops + + + def get_outpath(): + """Get path where results are saved by default""" + path = get_opt('outdir_samples', None) + if path is None or len(path) == 0: + path = get_opt('outdir_extras_samples', None) + assert path is not None and len(path) > 0 + return path + + + def unload_sd_model(): + from modules import shared, devices + if shared.sd_model is not None: + shared.sd_model.cond_stage_model.to(devices.cpu) + shared.sd_model.first_stage_model.to(devices.cpu) + + + def reload_sd_model(): + from modules import shared, devices + if shared.sd_model is not None: + shared.sd_model.cond_stage_model.to(devices.device) + 
shared.sd_model.first_stage_model.to(devices.device) + + def get_hide_dirs(): + import modules.shared + return modules.shared.hide_dirs +except: + # Standalone backbone + print( # " DepthMap did not detect stable-duiffusion-webui; launching with the standalone backbone.\n" + " The standalone mode is not on par with the stable-duiffusion-webui mode.\n" + " Some features may be missing or work differently. Please report bugs.\n") + + def save_image(image, path, basename, **kwargs): + import os + os.makedirs(path, exist_ok=True) + if 'suffix' not in kwargs or len(kwargs['suffix']) == 0: + kwargs['suffix'] = '' + else: + kwargs['suffix'] = f"-{kwargs['suffix']}" + format = get_opt('samples_format', kwargs['extension']) + fullfn = os.path.join( + path, f"{basename}-{get_next_sequence_number(path, basename)}{kwargs['suffix']}.{format}") + image.save(fullfn, format=format) + + def torch_gc(): + # TODO: is this really sufficient? + import torch + if torch.cuda.is_available(): + with torch.cuda.device('cuda'): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + + launched_at = int(datetime.now().timestamp()) + backbone_current_seq_number = 0 + + def get_next_sequence_number(outpath=None, basename=None): + global backbone_current_seq_number + backbone_current_seq_number += 1 + return int(f"{launched_at}{backbone_current_seq_number:04}") + + def wrap_gradio_gpu_call(f): return f # Displaying various stats is not supported + + def listfiles(dirname): + import os + filenames = [os.path.join(dirname, x) for x in sorted(os.listdir(dirname)) if not x.startswith(".")] + return [file for file in filenames if os.path.isfile(file)] + + def get_opt(name, default): return default # Configuring is not supported + + + def get_cmd_opt(name, default): return default # Configuring is not supported + + def gather_ops(): return {} # Configuring is not supported + + def get_outpath(): return str(pathlib.Path('.', 'outputs')) + + def unload_sd_model(): pass # Not needed + + def 
reload_sd_model(): pass # Not needed + + def get_hide_dirs(): return {} # Directories will not be hidden from traversal (except when starts with the dot) diff --git a/src/common_ui.py b/src/common_ui.py new file mode 100644 index 0000000..68583d3 --- /dev/null +++ b/src/common_ui.py @@ -0,0 +1,491 @@ +import traceback +from pathlib import Path +import gradio as gr +from PIL import Image + +from src import backbone +from src.core import core_generation_funnel, unload_models, run_makevideo +from src.depthmap_generation import ModelHolder +from src.gradio_args_transport import GradioComponentBundle +from src.misc import * + + +# Ugly workaround to fix gradio tempfile issue +def ensure_gradio_temp_directory(): + try: + import tempfile + path = os.path.join(tempfile.gettempdir(), 'gradio') + if not (os.path.exists(path)): + os.mkdir(path) + except Exception as e: + traceback.print_exc() + + +ensure_gradio_temp_directory() + + +def main_ui_panel(is_depth_tab): + inp = GradioComponentBundle() + # TODO: Greater visual separation + with gr.Blocks(): + with gr.Row(): + inp += 'compute_device', gr.Radio(label="Compute on", choices=['GPU', 'CPU'], value='GPU') + # TODO: Should return value instead of index. Maybe Enum should be used? 
+ inp += 'model_type', gr.Dropdown(label="Model", + choices=['res101', 'dpt_beit_large_512 (midas 3.1)', + 'dpt_beit_large_384 (midas 3.1)', 'dpt_large_384 (midas 3.0)', + 'dpt_hybrid_384 (midas 3.0)', + 'midas_v21', 'midas_v21_small', + 'zoedepth_n (indoor)', 'zoedepth_k (outdoor)', 'zoedepth_nk'], + value='res101', + type="index") + with gr.Box(): + with gr.Row(): + inp += 'boost', gr.Checkbox(label="BOOST (multi-resolution merging)", value=True) + inp += 'match_size', gr.Checkbox(label="Match net size to input size", value=False) + with gr.Row(visible=False) as options_depend_on_match_size: + inp += 'net_width', gr.Slider(minimum=64, maximum=2048, step=64, label='Net width', value=448) + inp += 'net_height', gr.Slider(minimum=64, maximum=2048, step=64, label='Net height', value=448) + + with gr.Box(): + with gr.Row(): + with gr.Group(): + inp += "save_outputs", gr.Checkbox(label="Save Outputs", value=True) # 50% of width + with gr.Group(): # 50% of width + inp += "output_depth", gr.Checkbox(label="Output DepthMap", value=True) + inp += "invert_depth", gr.Checkbox(label="Invert (black=near, white=far)", value=False) + with gr.Row() as options_depend_on_output_depth_1: + inp += "combine_output", gr.Checkbox( + label="Combine input and depthmap into one image", value=False) + inp += "combine_output_axis", gr.Radio(label="Combine axis", choices=['Vertical', 'Horizontal'], + value='Horizontal', type="index", visible=False) + with gr.Box(): + with gr.Row(): + inp += 'clipdepth', gr.Checkbox(label="Clip and renormalize DepthMap", value=False) + with gr.Row(visible=False) as clip_options_row_1: + inp += "clipthreshold_far", gr.Slider(minimum=0, maximum=1, step=0.001, label='Far clip', value=0) + inp += "clipthreshold_near", gr.Slider(minimum=0, maximum=1, step=0.001, label='Near clip', value=1) + + with gr.Box(): + with gr.Row(): + inp += "show_heat", gr.Checkbox(label="Generate HeatMap", value=False) + # gr.Checkbox(label="Generate NormalMap", value=False) # TODO: this 
is a fake door + + with gr.Box(): + with gr.Row(): + inp += "gen_stereo", gr.Checkbox(label="Generate stereoscopic image(s)", value=False) + with gr.Column(visible=False) as stereo_options: + with gr.Row(): + inp += "stereo_modes", gr.CheckboxGroup( + ["left-right", "right-left", "top-bottom", "bottom-top", "red-cyan-anaglyph"], + label="Output", value=["left-right", "red-cyan-anaglyph"]) + with gr.Row(): + inp += "stereo_divergence", gr.Slider(minimum=0.05, maximum=10.005, step=0.01, + label='Divergence (3D effect)', + value=2.5) + inp += "stereo_separation", gr.Slider(minimum=-5.0, maximum=5.0, step=0.01, + label='Separation (moves images apart)', + value=0.0) + with gr.Row(): + inp += "stereo_fill", gr.Dropdown(label="Gap fill technique", + choices=['none', 'naive', 'naive_interpolating', 'polylines_soft', + 'polylines_sharp'], value='polylines_sharp', + type="value") + inp += "stereo_balance", gr.Slider(minimum=-1.0, maximum=1.0, step=0.05, + label='Balance between eyes', + value=0.0) + + with gr.Box(): + with gr.Column(): + inp += "gen_mesh", gr.Checkbox( + label="Generate simple 3D mesh", value=False, visible=True) + with gr.Column(visible=False) as mesh_options: + with gr.Row(): + gr.HTML(value="Generates fast, accurate only with ZoeDepth models and no boost, no custom maps") + with gr.Row(): + inp += "mesh_occlude", gr.Checkbox(label="Remove occluded edges", value=True, visible=True) + inp += "mesh_spherical", gr.Checkbox(label="Equirectangular projection", value=False, visible=True) + + if is_depth_tab: + with gr.Box(): + with gr.Column(): + inp += "inpaint", gr.Checkbox( + label="Generate 3D inpainted mesh", value=False) + with gr.Column(visible=False) as inpaint_options_row_0: + gr.HTML("Generation is sloooow, required for generating videos") + inp += "inpaint_vids", gr.Checkbox( + label="Generate 4 demo videos with 3D inpainted mesh.", value=False) + gr.HTML("More options for generating video can be found in the Generate video tab") + + with gr.Box(): + 
# TODO: it should be clear from the UI that there is an option of the background removal + # that does not use the model selected above + with gr.Row(): + inp += "background_removal", gr.Checkbox(label="Remove background", value=False) + with gr.Column(visible=False) as bgrem_options: + with gr.Row(): + inp += "save_background_removal_masks", gr.Checkbox(label="Save the foreground masks", value=False) + inp += "pre_depth_background_removal", gr.Checkbox(label="Pre-depth background removal", value=False) + with gr.Row(): + inp += "background_removal_model", gr.Dropdown(label="Rembg Model", + choices=['u2net', 'u2netp', 'u2net_human_seg', + 'silueta'], + value='u2net', type="value") + + with gr.Box(): + gr.HTML(f"{SCRIPT_FULL_NAME}
") + gr.HTML("Information, comment and share @ " + "https://github.com/thygate/stable-diffusion-webui-depthmap-script") + + inp += "gen_normal", gr.Checkbox(label="Generate Normalmap (hidden! api only)", value=False, visible=False) + + def update_delault_net_size(model_type): + w, h = ModelHolder.get_default_net_size(model_type) + return inp['net_width'].update(value=w), inp['net_height'].update(value=h) + + inp['model_type'].change( + fn=update_delault_net_size, + inputs=inp['model_type'], + outputs=[inp['net_width'], inp['net_height']] + ) + + inp['boost'].change( + fn=lambda a, b: (inp['match_size'].update(visible=not a), + options_depend_on_match_size.update(visible=not a and not b)), + inputs=[inp['boost'], inp['match_size']], + outputs=[inp['match_size'], options_depend_on_match_size] + ) + inp['match_size'].change( + fn=lambda a, b: options_depend_on_match_size.update(visible=not a and not b), + inputs=[inp['boost'], inp['match_size']], + outputs=[options_depend_on_match_size] + ) + + inp['output_depth'].change( + fn=lambda a: (inp['invert_depth'].update(visible=a), options_depend_on_output_depth_1.update(visible=a)), + inputs=[inp['output_depth']], + outputs=[inp['invert_depth'], options_depend_on_output_depth_1] + ) + + inp['combine_output'].change( + fn=lambda v: inp['combine_output_axis'].update(visible=v), + inputs=[inp['combine_output']], + outputs=[inp['combine_output_axis']] + ) + + inp['clipdepth'].change( + fn=lambda v: clip_options_row_1.update(visible=v), + inputs=[inp['clipdepth']], + outputs=[clip_options_row_1] + ) + inp['clipthreshold_far'].change( + fn=lambda a, b: a if b < a else b, + inputs=[inp['clipthreshold_far'], inp['clipthreshold_near']], + outputs=[inp['clipthreshold_near']] + ) + inp['clipthreshold_near'].change( + fn=lambda a, b: a if b > a else b, + inputs=[inp['clipthreshold_near'], inp['clipthreshold_far']], + outputs=[inp['clipthreshold_far']] + ) + + inp['gen_stereo'].change( + fn=lambda v: stereo_options.update(visible=v), + 
inputs=[inp['gen_stereo']], + outputs=[stereo_options] + ) + + inp['gen_mesh'].change( + fn=lambda v: mesh_options.update(visible=v), + inputs=[inp['gen_mesh']], + outputs=[mesh_options] + ) + + if is_depth_tab: + inp['inpaint'].change( + fn=lambda v: inpaint_options_row_0.update(visible=v), + inputs=[inp['inpaint']], + outputs=[inpaint_options_row_0] + ) + + inp['background_removal'].change( + fn=lambda v: bgrem_options.update(visible=v), + inputs=[inp['background_removal']], + outputs=[bgrem_options] + ) + + return inp + +def open_folder_action(): + # Adapted from stable-diffusion-webui + f = backbone.get_outpath() + if backbone.get_cmd_opt('hide_ui_dir_config', False): + return + if not os.path.exists(f) or not os.path.isdir(f): + raise "Couldn't open output folder" # .isdir is security-related, do not remove! + import platform + import subprocess as sp + path = os.path.normpath(f) + if platform.system() == "Windows": + os.startfile(path) + elif platform.system() == "Darwin": + sp.Popen(["open", path]) + elif "microsoft-standard-WSL2" in platform.uname().release: + sp.Popen(["wsl-open", path]) + else: + sp.Popen(["xdg-open", path]) + +def on_ui_tabs(): + inp = GradioComponentBundle() + with gr.Blocks(analytics_enabled=False, title="DepthMap") as depthmap_interface: + with gr.Row().style(equal_height=False): + with gr.Column(variant='panel'): + inp += 'depthmap_mode', gr.HTML(visible=False, value='0') + with gr.Tabs(): + with gr.TabItem('Single Image') as depthmap_mode_0: + with gr.Group(): + with gr.Row(): + inp += gr.Image(label="Source", source="upload", interactive=True, type="pil", + elem_id="depthmap_input_image") + # TODO: depthmap generation settings should disappear when using this + inp += gr.File(label="Custom DepthMap", file_count="single", interactive=True, + type="file", elem_id='custom_depthmap_img', visible=False) + inp += gr.Checkbox(elem_id="custom_depthmap", label="Use custom DepthMap", value=False) + with gr.TabItem('Batch Process') as 
depthmap_mode_1: + inp += gr.File(elem_id='image_batch', label="Batch Process", file_count="multiple", + interactive=True, type="file") + with gr.TabItem('Batch from Directory') as depthmap_mode_2: + inp += gr.Textbox(elem_id="depthmap_batch_input_dir", label="Input directory", + **backbone.get_hide_dirs(), + placeholder="A directory on the same machine where the server is running.") + inp += gr.Textbox(elem_id="depthmap_batch_output_dir", label="Output directory", + **backbone.get_hide_dirs(), + placeholder="Leave blank to save images to the default path.") + gr.HTML("Files in the output directory may be overwritten.") + inp += gr.Checkbox(elem_id="depthmap_batch_reuse", + label="Skip generation and use (edited/custom) depthmaps " + "in output directory when a file already exists.", + value=True) + submit = gr.Button('Generate', elem_id="depthmap_generate", variant='primary') + inp += main_ui_panel(True) # Main panel is inserted here + unloadmodels = gr.Button('Unload models', elem_id="depthmap_unloadmodels") + + with gr.Column(variant='panel'): + with gr.Tabs(elem_id="mode_depthmap_output"): + with gr.TabItem('Depth Output'): + with gr.Group(): + result_images = gr.Gallery(label='Output', show_label=False, + elem_id=f"depthmap_gallery").style(grid=4) + with gr.Column(): + html_info = gr.HTML() + folder_symbol = '\U0001f4c2' # 📂 + gr.Button(folder_symbol, visible=not backbone.get_cmd_opt('hide_ui_dir_config', False)).click( + fn=lambda: open_folder_action(), inputs=[], outputs=[], + ) + + with gr.TabItem('3D Mesh'): + with gr.Group(): + result_depthmesh = gr.Model3D(label="3d Mesh", clear_color=[1.0, 1.0, 1.0, 1.0]) + with gr.Row(): + # loadmesh = gr.Button('Load') + clearmesh = gr.Button('Clear') + + with gr.TabItem('Generate video'): + # generate video + with gr.Group(): + with gr.Row(): + gr.Markdown("Generate video from inpainted(!) 
mesh.") + with gr.Row(): + depth_vid = gr.Video(interactive=False) + with gr.Column(): + vid_html_info_x = gr.HTML() + vid_html_info = gr.HTML() + fn_mesh = gr.Textbox(label="Input Mesh (.ply | .obj)", **backbone.get_hide_dirs(), + placeholder="A file on the same machine where " + "the server is running.") + with gr.Row(): + vid_numframes = gr.Textbox(label="Number of frames", value="300") + vid_fps = gr.Textbox(label="Framerate", value="40") + vid_format = gr.Dropdown(label="Format", choices=['mp4', 'webm'], value='mp4', + type="value", elem_id="video_format") + vid_ssaa = gr.Dropdown(label="SSAA", choices=['1', '2', '3', '4'], value='3', + type="value", elem_id="video_ssaa") + with gr.Row(): + vid_traj = gr.Dropdown(label="Trajectory", + choices=['straight-line', 'double-straight-line', 'circle'], + value='double-straight-line', type="index", + elem_id="video_trajectory") + vid_shift = gr.Textbox(label="Translate: x, y, z", value="-0.015, 0.0, -0.05") + vid_border = gr.Textbox(label="Crop: top, left, bottom, right", + value="0.03, 0.03, 0.05, 0.03") + vid_dolly = gr.Checkbox(label="Dolly", value=False, elem_classes="smalltxt") + with gr.Row(): + submit_vid = gr.Button('Generate Video', elem_id="depthmap_generatevideo", + variant='primary') + + + inp += inp.enkey_tail() + + depthmap_mode_0.select(lambda: '0', None, inp['depthmap_mode']) + depthmap_mode_1.select(lambda: '1', None, inp['depthmap_mode']) + depthmap_mode_2.select(lambda: '2', None, inp['depthmap_mode']) + + inp['custom_depthmap'].change( + fn=lambda v: inp['custom_depthmap_img'].update(visible=v), + inputs=[inp['custom_depthmap']], + outputs=[inp['custom_depthmap_img']] + ) + + unloadmodels.click( + fn=unload_models, + inputs=[], + outputs=[] + ) + + clearmesh.click( + fn=lambda: None, + inputs=[], + outputs=[result_depthmesh] + ) + + submit.click( + fn=backbone.wrap_gradio_gpu_call(run_generate), + inputs=inp.enkey_body(), + outputs=[ + result_images, + fn_mesh, + result_depthmesh, + html_info + ] + 
) + + submit_vid.click( + fn=backbone.wrap_gradio_gpu_call(run_makevideo), + inputs=[ + fn_mesh, + vid_numframes, + vid_fps, + vid_traj, + vid_shift, + vid_border, + vid_dolly, + vid_format, + vid_ssaa + ], + outputs=[ + depth_vid, + vid_html_info_x, + vid_html_info + ] + ) + + return depthmap_interface + + +def run_generate(*inputs): + inputs = GradioComponentBundle.enkey_to_dict(inputs) + depthmap_mode = inputs['depthmap_mode'] + depthmap_batch_input_dir = inputs['depthmap_batch_input_dir'] + image_batch = inputs['image_batch'] + depthmap_input_image = inputs['depthmap_input_image'] + depthmap_batch_output_dir = inputs['depthmap_batch_output_dir'] + depthmap_batch_reuse = inputs['depthmap_batch_reuse'] + custom_depthmap = inputs['custom_depthmap'] + custom_depthmap_img = inputs['custom_depthmap_img'] + + inputimages = [] + # Allow supplying custom depthmaps + inputdepthmaps = [] + # Also keep track of original file names + inputnames = [] + + if depthmap_mode == '2' and depthmap_batch_output_dir != '': + outpath = depthmap_batch_output_dir + else: + outpath = backbone.get_outpath() + + if depthmap_mode == '0': # Single image + if depthmap_input_image is None: + return [], None, None, "Please select an input image" + inputimages.append(depthmap_input_image) + inputnames.append(None) + if custom_depthmap: + if custom_depthmap_img is None: + return [], None, None, \ + "Custom depthmap is not specified. Please either supply it or disable this option." 
+ inputdepthmaps.append(Image.open(os.path.abspath(custom_depthmap_img.name))) + else: + inputdepthmaps.append(None) + if depthmap_mode == '1': # Batch Process + if image_batch is None: + return [], None, None, "Please select input images", "" + for img in image_batch: + image = Image.open(os.path.abspath(img.name)) + inputimages.append(image) + inputnames.append(os.path.splitext(img.orig_name)[0]) + elif depthmap_mode == '2': # Batch from Directory + assert not backbone.get_cmd_opt('hide_ui_dir_config', False), '--hide-ui-dir-config option must be disabled' + if depthmap_batch_input_dir == '': + return [], None, None, "Please select an input directory." + if depthmap_batch_input_dir == depthmap_batch_output_dir: + return [], None, None, "Please pick different directories for batch processing." + image_list = backbone.listfiles(depthmap_batch_input_dir) + for path in image_list: + try: + inputimages.append(Image.open(path)) + inputnames.append(path) + + custom_depthmap = None + if depthmap_batch_reuse: + basename = Path(path).stem + # Custom names are not used in samples directory + if outpath != backbone.get_opt('outdir_extras_samples', None): + # Possible filenames that the custom depthmaps may have + name_candidates = [f'{basename}-0000.{backbone.get_opt("samples_format", "png")}', # current format + f'{basename}.png', # human-intuitive format + f'{Path(path).name}'] # human-intuitive format (worse) + for fn_cand in name_candidates: + path_cand = os.path.join(outpath, fn_cand) + if os.path.isfile(path_cand): + custom_depthmap = Image.open(os.path.abspath(path_cand)) + break + inputdepthmaps.append(custom_depthmap) + except Exception as e: + print(f'Failed to load {path}, ignoring. 
Exception: {str(e)}') + inputdepthmaps_n = len([1 for x in inputdepthmaps if x is not None]) + print(f'{len(inputimages)} images will be processed, {inputdepthmaps_n} existing depthmaps will be reused') + + outputs, fn_mesh, display_mesh = core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inputs, backbone.gather_ops()) + + # Saving images + show_images = [] + for input_i, imgs in enumerate(outputs): + basename = 'depthmap' + if depthmap_mode == '2' and inputnames[input_i] is not None and outpath != backbone.get_opt('outdir_extras_samples', None): + basename = Path(inputnames[input_i]).stem + + for image_type, image in list(imgs.items()): + show_images += [image] + if inputs["save_outputs"]: + try: + suffix = "" if image_type == "depth" else f"{image_type}" + backbone.save_image(image, path=outpath, basename=basename, seed=None, + prompt=None, extension=backbone.get_opt('samples_format', 'png'), short_filename=True, + no_prompt=True, grid=False, pnginfo_section_name="extras", + suffix=suffix) + except Exception as e: + if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): + raise e + print('Catched exception: image has wrong mode!') + traceback.print_exc() + + display_mesh = None + # use inpainted 3d mesh to show in 3d model output when enabled in settings + if backbone.get_opt('depthmap_script_show_3d_inpaint', True) and fn_mesh is not None and len(fn_mesh) > 0: + display_mesh = fn_mesh + # however, don't show 3dmodel when disabled in settings + if not backbone.get_opt('depthmap_script_show_3d', True): + display_mesh = None + # TODO: return more info + return show_images, fn_mesh, display_mesh, 'Generated!' 
diff --git a/src/core.py b/src/core.py index ccc3e6e..32a81b1 100644 --- a/src/core.py +++ b/src/core.py @@ -1,11 +1,6 @@ from pathlib import Path - from PIL import Image -from modules import shared, devices -from modules.images import get_next_sequence_number -from modules.shared import opts, cmd_opts - try: from tqdm import trange except: @@ -21,9 +16,10 @@ import traceback # Our code -from src.main import * +from src.misc import * from src.stereoimage_generation import create_stereoimages from src.depthmap_generation import ModelHolder +from src import backbone # 3d-photo-inpainting imports from inpaint.mesh import write_mesh, read_mesh, output_3d_photo @@ -47,19 +43,7 @@ def convert_i16_to_rgb(image, like): return output -def unload_sd_model(): - if shared.sd_model is not None: - shared.sd_model.cond_stage_model.to(devices.cpu) - shared.sd_model.first_stage_model.to(devices.cpu) - - -def reload_sd_model(): - if shared.sd_model is not None: - shared.sd_model.cond_stage_model.to(devices.device) - shared.sd_model.first_stage_model.to(devices.device) - - -def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp): +def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp, ops=None): if len(inputimages) == 0 or inputimages[0] is None: return [], '', '' if inputdepthmaps is None or len(inputdepthmaps) == 0: @@ -97,10 +81,14 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp stereo_modes = inp["stereo_modes"] stereo_separation = inp["stereo_separation"] + if ops is None: + ops = {} + model_holder.update_settings(**ops) + # TODO: ideally, run_depthmap should not save meshes - that makes the function not pure print(SCRIPT_FULL_NAME) - unload_sd_model() + backbone.unload_sd_model() # TODO: this still should not be here background_removed_images = [] @@ -306,7 +294,7 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp else: raise e finally: - if hasattr(opts, 
'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels: + if backbone.get_opt('depthmap_script_keepmodels', True): model_holder.offload() # Swap to CPU memory else: if 'model' in locals(): @@ -316,7 +304,7 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp model_holder.unload_models() gc.collect() - devices.torch_gc() + backbone.torch_gc() # TODO: This should not be here mesh_fi = None @@ -326,14 +314,14 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp except Exception as e: print(f'{str(e)}, some issue with generating inpainted mesh') - reload_sd_model() + backbone.reload_sd_model() print("All done.\n") return generated_images, mesh_fi, meshsimple_fi def get_uniquefn(outpath, basename, ext): # Inefficient and may fail, maybe use unbounded binary search? - basecount = get_next_sequence_number(outpath, basename) + basecount = backbone.get_next_sequence_number(outpath, basename) if basecount > 0: basecount = basecount - 1 fullfn = None for i in range(500): @@ -401,10 +389,7 @@ def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, inpaint_vids, v config['repeat_inpaint_edge'] = True config['ply_fmt'] = "bin" - config['save_ply'] = False - if hasattr(opts, 'depthmap_script_save_ply') and opts.depthmap_script_save_ply: - config['save_ply'] = True - + config['save_ply'] = backbone.get_opt('depthmap_script_save_ply', False) config['save_obj'] = True if device == torch.device("cpu"): @@ -471,7 +456,7 @@ def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, inpaint_vids, v [-0.05, -0.05, -0.05, -0.05], ['dolly-zoom-in', 'zoom-in', 'circle', 'swing'], False, vid_format, vid_ssaa) - devices.torch_gc() + backbone.torch_gc() finally: del rgb_model @@ -480,7 +465,7 @@ def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, inpaint_vids, v depth_edge_model = None del depth_feat_model depth_feat_model = None - devices.torch_gc() + backbone.torch_gc() return mesh_fi @@ 
-602,9 +587,9 @@ def run_makevideo(fn_mesh, vid_numframes, vid_fps, vid_traj, vid_shift, vid_bord # output path and filename mess .. basename = Path(fn_mesh).stem - outpath = opts.outdir_samples or opts.outdir_extras_samples + outpath = backbone.get_outpath() # unique filename - basecount = get_next_sequence_number(outpath, basename) + basecount = backbone.get_next_sequence_number(outpath, basename) if basecount > 0: basecount = basecount - 1 fullfn = None for i in range(500): @@ -697,9 +682,7 @@ def depth_edges_mask(depth): def create_mesh(image, depth, keep_edges=False, spherical=False): import trimesh from dzoedepth.utils.geometry import depth_to_points, create_triangles - maxsize = 1024 - if hasattr(opts, 'depthmap_script_mesh_maxsize'): - maxsize = opts.depthmap_script_mesh_maxsize + maxsize = backbone.get_opt('depthmap_script_mesh_maxsize', 2048) # limit the size of the input image image.thumbnail((maxsize, maxsize)) diff --git a/src/depthmap_generation.py b/src/depthmap_generation.py index 6812d81..0ea4a37 100644 --- a/src/depthmap_generation.py +++ b/src/depthmap_generation.py @@ -1,42 +1,36 @@ +import gc +import os.path from operator import getitem -from PIL import Image -from torchvision.transforms import Compose, transforms - -# TODO: depthmap_generation should not depend on WebUI -from modules import shared, devices -from modules.shared import opts, cmd_opts - -import torch, gc import cv2 -import os.path import numpy as np import skimage.measure - -# Our code -from src.main import * +from PIL import Image +import torch +from torchvision.transforms import Compose, transforms # midas imports from dmidas.dpt_depth import DPTDepthModel from dmidas.midas_net import MidasNet from dmidas.midas_net_custom import MidasNet_small from dmidas.transforms import Resize, NormalizeImage, PrepareForNet - +# zoedepth +from dzoedepth.models.builder import build_model +from dzoedepth.utils.config import get_config # AdelaiDepth/LeReS imports from 
lib.multi_depth_model_woauxi import RelDepthModel from lib.net_tools import strip_prefix_if_present - +from pix2pix.models.pix2pix4depth_model import Pix2Pix4DepthModel # pix2pix/merge net imports from pix2pix.options.test_options import TestOptions -from pix2pix.models.pix2pix4depth_model import Pix2Pix4DepthModel -# zoedepth -from dzoedepth.models.builder import build_model -from dzoedepth.utils.config import get_config +# Our code +from src.misc import * +from src import backbone -global device +global depthmap_device -class ModelHolder(): +class ModelHolder: def __init__(self): self.depth_model = None self.pix2pix_model = None @@ -48,6 +42,20 @@ def __init__(self): self.resize_mode = None self.normalization = None + # Settings (initialized to sensible values, should be updated) + self.boost_whole_size_threshold = 1600 # R_max from the paper by default + self.no_half = False + self.precision = "autocast" + + def update_settings(self, boost_whole_size_threshold=None, no_half=None, precision=None): + if boost_whole_size_threshold is not None: + self.boost_whole_size_threshold = boost_whole_size_threshold + if no_half is not None: + self.no_half = no_half + if precision is not None: + self.precision = precision + + def ensure_models(self, model_type, device: torch.device, boost: bool): # TODO: could make it more granular if model_type == -1 or model_type is None: @@ -75,7 +83,6 @@ def load_models(self, model_type, device: torch.device, boost: bool): resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - # TODO: net_w, net_h model = None if model_type == 0: # "res101" model_path = f"{model_dir}/res101.pth" @@ -93,7 +100,7 @@ def load_models(self, model_type, device: torch.device, boost: bool): model = RelDepthModel(backbone='resnext101') model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."), strict=True) del checkpoint - devices.torch_gc() + backbone.torch_gc() if model_type == 1: # 
"dpt_beit_large_512" midas 3.1 model_path = f"{model_dir}/dpt_beit_large_512.pt" @@ -190,8 +197,8 @@ def load_models(self, model_type, device: torch.device, boost: bool): model.eval() # prepare for evaluation # optimize if device == torch.device("cuda") and model_type in [0, 1, 2, 3, 4, 5, 6]: - model = model.to(memory_format=torch.channels_last) - if not cmd_opts.no_half and model_type != 0 and not boost: # TODO: zoedepth, too? + model = model.to(memory_format=torch.channels_last) # TODO: weird + if not self.no_half and model_type != 0 and not boost: # TODO: zoedepth, too? model = model.half() model.to(device) # to correct device @@ -217,11 +224,10 @@ def load_models(self, model_type, device: torch.device, boost: bool): self.pix2pix_model.load_networks('latest') self.pix2pix_model.eval() - devices.torch_gc() + backbone.torch_gc() @staticmethod def get_default_net_size(model_type): - # TODO: fill in, use in the GUI sizes = { 0: [448, 448], 1: [512, 512], @@ -264,7 +270,7 @@ def unload_models(self): del self.pix2pix_model self.pix2pix_model = None gc.collect() - devices.torch_gc() + backbone.torch_gc() self.depth_model_type = None self.device = None @@ -272,9 +278,8 @@ def unload_models(self): def get_raw_prediction(self, input, net_width, net_height): """Get prediction from the model currently loaded by the ModelHolder object. 
If boost is enabled, net_width and net_height will be ignored.""" - # TODO: supply net size for zoedepth - global device - device = self.device + global depthmap_device + depthmap_device = self.device # input image img = cv2.cvtColor(np.asarray(input), cv2.COLOR_BGR2RGB) / 255.0 # compute depthmap @@ -285,9 +290,11 @@ def get_raw_prediction(self, input, net_width, net_height): raw_prediction = estimatezoedepth(input, self.depth_model, net_width, net_height) else: raw_prediction = estimatemidas(img, self.depth_model, net_width, net_height, - self.resize_mode, self.normalization) + self.resize_mode, self.normalization, self.no_half, + self.precision == "autocast") else: - raw_prediction = estimateboost(img, self.depth_model, self.depth_model_type, self.pix2pix_model) + raw_prediction = estimateboost(img, self.depth_model, self.depth_model_type, self.pix2pix_model, + self.boost_whole_size_threshold) raw_prediction_invert = self.depth_model_type in [0, 7, 8, 9] return raw_prediction, raw_prediction_invert @@ -300,7 +307,7 @@ def estimateleres(img, model, w, h): # compute with torch.no_grad(): - if device == torch.device("cuda"): + if depthmap_device == torch.device("cuda"): img_torch = img_torch.cuda() prediction = model.depth_model(img_torch) @@ -332,7 +339,7 @@ def scale_torch(img): def estimatezoedepth(img, model, w, h): # x = transforms.ToTensor()(img).unsqueeze(0) # x = x.type(torch.float32) - # x.to(device) + # x.to(depthmap_device) # prediction = model.infer(x) model.core.prep.resizer._Resize__width = w model.core.prep.resizer._Resize__height = h @@ -341,7 +348,7 @@ def estimatezoedepth(img, model, w, h): return prediction -def estimatemidas(img, model, w, h, resize_mode, normalization): +def estimatemidas(img, model, w, h, resize_mode, normalization, no_half, precision_is_autocast): import contextlib # init transform transform = Compose( @@ -364,13 +371,13 @@ def estimatemidas(img, model, w, h, resize_mode, normalization): img_input = transform({"image": 
img})["image"] # compute - precision_scope = torch.autocast if shared.cmd_opts.precision == "autocast" and device == torch.device( + precision_scope = torch.autocast if precision_is_autocast and depthmap_device == torch.device( "cuda") else contextlib.nullcontext with torch.no_grad(), precision_scope("cuda"): - sample = torch.from_numpy(img_input).to(device).unsqueeze(0) - if device == torch.device("cuda"): + sample = torch.from_numpy(img_input).to(depthmap_device).unsqueeze(0) + if depthmap_device == torch.device("cuda"): sample = sample.to(memory_format=torch.channels_last) - if not cmd_opts.no_half: + if not no_half: sample = sample.half() prediction = model.forward(sample) prediction = ( @@ -600,12 +607,8 @@ def parse(self): return self.opt -def estimateboost(img, model, model_type, pix2pixmodel): - pix2pixsize = 1024 # TODO: to setting? - whole_size_threshold = 1600 # R_max from the paper # TODO: to setting? - # get settings - if hasattr(opts, 'depthmap_script_boost_rmax'): - whole_size_threshold = opts.depthmap_script_boost_rmax +def estimateboost(img, model, model_type, pix2pixmodel, whole_size_threshold): + pix2pixsize = 1024 # TODO: pix2pixsize and whole_size_threshold to setting? if model_type == 0: # leres net_receptive_field_size = 448 @@ -618,7 +621,7 @@ def estimateboost(img, model, model_type, pix2pixmodel): patch_netsize = 2 * net_receptive_field_size gc.collect() - devices.torch_gc() + backbone.torch_gc() # Generate mask used to smoothly blend the local pathc estimations to the base estimate. # It is arbitrarily large to avoid artifacts during rescaling for each crop. 
@@ -1024,8 +1027,8 @@ def estimatemidasBoost(img, model, w, h): # compute with torch.no_grad(): - sample = torch.from_numpy(img_input).to(device).unsqueeze(0) - if device == torch.device("cuda"): + sample = torch.from_numpy(img_input).to(depthmap_device).unsqueeze(0) + if depthmap_device == torch.device("cuda"): sample = sample.to(memory_format=torch.channels_last) prediction = model.forward(sample) diff --git a/src/main.py b/src/misc.py similarity index 60% rename from src/main.py rename to src/misc.py index d3fed1d..875211f 100644 --- a/src/main.py +++ b/src/misc.py @@ -1,27 +1,36 @@ import subprocess import os import pathlib -import torch +import builtins def get_commit_hash(): - try: + def call_git(dir): return subprocess.check_output( [os.environ.get("GIT", "git"), "rev-parse", "HEAD"], - cwd=pathlib.Path.cwd().joinpath('extensions/stable-diffusion-webui-depthmap-script/'), - shell=False, - stderr=subprocess.DEVNULL, - encoding='utf8').strip()[0:8] + cwd=dir, shell=False, stderr=subprocess.DEVNULL, encoding='utf8').strip()[0:8] + + try: + file_path = pathlib.Path(__file__) + path = file_path.parts + while len(path) > 0 and path[-1] != REPOSITORY_NAME: + path = path[:-1] + if len(path) >= 2 and path[-1] == REPOSITORY_NAME and path[-2] == "extensions": + return call_git(str(pathlib.Path(*path))) + + return call_git(pathlib.Path.cwd().joinpath('extensions/stable-diffusion-webui-depthmap-script/')) except Exception: return "" +REPOSITORY_NAME = "stable-diffusion-webui-depthmap-script" SCRIPT_NAME = "DepthMap" -SCRIPT_VERSION = "v0.4.0" +SCRIPT_VERSION = "v0.4.1" SCRIPT_FULL_NAME = f"{SCRIPT_NAME} {SCRIPT_VERSION} ({get_commit_hash()})" def ensure_file_downloaded(filename, url, sha256_hash_prefix=None): - # Do not check the hash every time - it is somewhat time-consuming + import torch + # Do not check the hash every time - it is somewhat time-consumin if os.path.exists(filename): return