diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..4f997759 --- /dev/null +++ b/.gitignore @@ -0,0 +1,216 @@ +# Created by https://www.toptal.com/developers/gitignore/api/macos,python +# Edit at https://www.toptal.com/developers/gitignore?templates=macos,python + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +# End of https://www.toptal.com/developers/gitignore/api/macos,python + + +data/** +checkpoints/** +output/** +.vscode/** +wildpose_data \ No newline at end of file diff --git a/README.md b/README.md index 6c4ea617..2f2b469b 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ## Depth Pro: Sharp Monocular Metric Depth in Less Than a Second This software project accompanies the research paper: -**[Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073)**, +**[Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073)**, *Aleksei Bochkovskii, Amaƫl Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, and Vladlen Koltun*. 
![](data/depth-pro-teaser.jpg) @@ -32,7 +32,9 @@ source get_pretrained_models.sh # Files will be downloaded to `checkpoints` di We provide a helper script to directly run the model on a single image: ```bash # Run prediction on a single image: -depth-pro-run -i ./data/example.jpg +depth-pro-run \ + -i ./data/example.jpg \ + -o ./output # Run `depth-pro-run -h` for available options. ``` @@ -56,8 +58,23 @@ depth = prediction["depth"] # Depth in [m]. focallength_px = prediction["focallength_px"] # Focal length in pixels. ``` +### Visualize the depth image -### Evaluation (boundary metrics) +Visualize the depth image with Open3D. + +```bash +python src/tools/visualize_rgbd.py --input_depth output/depth.npz --input_image data/rgb.jpeg --focal_length 1500.0 +``` + +Visualize the depth image with Rerun. + +```bash +python src/tools/visualize_rerun.py --depth_file output/depth.npz --image_or_video data/rgb.jpeg +``` + +![](docs/imgs/rerun_screenshot.png) + +### Evaluation (boundary metrics) Our boundary metrics can be found under `eval/boundary_metrics.py` and used as follows: @@ -65,10 +82,16 @@ Our boundary metrics can be found under `eval/boundary_metrics.py` and used as f # for a depth-based dataset boundary_f1 = SI_boundary_F1(predicted_depth, target_depth) -# for a mask-based dataset (image matting / segmentation) +# for a mask-based dataset (image matting / segmentation) boundary_recall = SI_boundary_Recall(predicted_depth, target_mask) ``` +### Visualization + +```bash +python src/tools/visualize_rgbd.py --input_depth output/depth.npz --input_image data/rgb.jpeg --focal_length 1500.0 +``` + ## Citation diff --git a/docs/imgs/rerun_screenshot.png b/docs/imgs/rerun_screenshot.png new file mode 100644 index 00000000..fab0d071 Binary files /dev/null and b/docs/imgs/rerun_screenshot.png differ diff --git a/src/tools/visualize_rerun.py b/src/tools/visualize_rerun.py new file mode 100644 index 00000000..8880be5d --- /dev/null +++ b/src/tools/visualize_rerun.py @@ -0,0 +1,176
@@
+import argparse
+import os
+import sys
+import time
+
+import cv2
+import numpy as np
+import rerun as rr
+from loguru import logger
+from tqdm import tqdm
+
+# Lower-case extensions opened as video; any other input is read as a still image.
+VIDEO_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
+
+
+def visualize_rgb_depth(rgb_image, depth_map, focal_length=None, timestamp=None):
+    """Log one RGB frame and its depth map to Rerun under a shared pinhole camera.
+
+    Args:
+    ----
+        rgb_image: Color image (H, W, 3) in RGB channel order
+        depth_map: Depth map (H, W), logged with ``meter=1.0``
+        focal_length: Focal length passed to ``rr.Pinhole``; ``0.7 * W`` when ``None``
+        timestamp: Optional timestamp in seconds for timeline visualization
+
+    """
+    # Set the timeline timestamp if provided
+    if timestamp is not None:
+        rr.set_time_seconds("frame_time", timestamp)
+
+    # Define the camera parameters using Pinhole model
+    h, w = depth_map.shape
+    focal_length = 0.7 * w if focal_length is None else focal_length
+    rr.log(
+        "world/camera/image",
+        rr.Pinhole(
+            resolution=[w, h],
+            focal_length=focal_length,
+        ),
+    )
+
+    # Log RGB image with compression
+    rr.log("world/camera/image/rgb", rr.Image(rgb_image).compress(jpeg_quality=95))
+
+    # Log depth image. `meter` is how many depth units make one meter; the caller
+    # rescales the depth beforehand (see --depth_scaling), so 1.0 is used here.
+    rr.log("world/camera/image/depth", rr.DepthImage(depth_map, meter=1.0))
+
+    # Reset timeline
+    rr.reset_time()
+
+
+def parse_args():
+    """Parse the command-line arguments, including Rerun's standard script flags."""
+    parser = argparse.ArgumentParser(description="Visualize depth maps using Rerun")
+    parser.add_argument(
+        "--depth_file",
+        type=str,
+        required=True,
+        help='Path to the depth map NPZ (array stored under "depth") or NPY file',
+    )
+    parser.add_argument(
+        "--image_or_video",
+        type=str,
+        required=True,
+        help="Path to the original image or video file",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./output/rerun_visualize",
+        help="Output directory for visualization files",
+    )
+    parser.add_argument(
+        "--invert_depth",
+        action="store_true",
+        help="If the depth map is inverted, set this flag",
+    )
+    parser.add_argument(
+        "--depth_scaling",
+        type=float,
+        default=1.0,
+        help="Depth scaling factor",
+    )
+    parser.add_argument(
+        "--focal_length",
+        type=float,
+        default=None,
+        help="Focal length of the camera",
+    )
+    parser.add_argument(
+        "--interactive",
+        action="store_true",
+        help="Run Rerun in interactive mode instead of recording",
+    )
+
+    # Add Rerun-specific arguments
+    rr.script_add_args(parser)
+    return parser.parse_args()
+
+
+def load_depth_maps(path, invert=False, scaling=1.0):
+    """Load depth from an NPZ ("depth" array) or NPY file, optionally inverted and scaled.
+
+    Raises:
+    ------
+        ValueError: If ``path`` ends with neither ``.npz`` nor ``.npy``.
+
+    """
+    if path.endswith(".npz"):
+        with np.load(path) as data:
+            depth_maps = data["depth"]
+    elif path.endswith(".npy"):
+        depth_maps = np.load(path)
+    else:
+        # Fail fast with a clear message instead of leaving depth_maps unbound.
+        raise ValueError(f"Unsupported depth file (expected .npz or .npy): {path}")
+    if invert:
+        depth_maps = 1.0 / depth_maps
+    return depth_maps * scaling
+
+
+def visualize_video(video_path, depth_maps, focal_length=None):
+    """Log every frame of a video with its depth map, timestamped by the video frame rate."""
+    if depth_maps.ndim != 3:
+        raise ValueError("Depth map must be 3D for video mode")
+    cap = cv2.VideoCapture(video_path)
+    try:
+        if not cap.isOpened():
+            raise OSError(f"Failed to open video: {video_path}")
+        n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        if depth_maps.shape[0] != n_frames:
+            raise ValueError("Depth map and video must have the same number of frames")
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        if fps <= 0:
+            # Avoid dividing by zero when the container carries no frame rate.
+            logger.warning("Video reports no frame rate; assuming 1 frame per second")
+            fps = 1.0
+        for i in tqdm(range(n_frames), desc="Visualizing video"):
+            ret, frame = cap.read()
+            if not ret:
+                logger.warning(f"Failed to read frame {i}")
+                break
+            # OpenCV decodes frames as BGR, while rr.Image interprets them as RGB.
+            rgb_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            visualize_rgb_depth(
+                rgb_image, depth_maps[i], focal_length=focal_length, timestamp=i / fps
+            )
+    finally:
+        # Release the capture even when validation or logging raises.
+        cap.release()
+
+
+def visualize_image(image_path, depth_map, focal_length=None):
+    """Log a single image with its depth map at timestamp 0."""
+    if depth_map.ndim != 2:
+        raise ValueError("Depth map must be 2D for image mode")
+    bgr_image = cv2.imread(image_path)
+    if bgr_image is None:  # cv2.imread reports failure by returning None
+        raise OSError(f"Failed to read image: {image_path}")
+    visualize_rgb_depth(
+        rgb_image=cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB),
+        depth_map=depth_map,
+        focal_length=focal_length,
+        timestamp=0,
+    )
+
+
+def main():
+    """Load the depth maps and the input media, log them to Rerun, then save or keep serving."""
+    args = parse_args()
+
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Configure logger
+    logger.remove()
+    logger.add(sys.stderr, level="INFO")
+    logger.add(f"{args.output_dir}/log.txt", level="DEBUG", rotation="10 MB")
+
+    # Initialize Rerun
+    recording_id = os.path.basename(args.depth_file).split(".")[0]
+    rr.init(recording_id, spawn=args.interactive)
+    if not args.interactive:
+        # rr.save is meant to be called before any data is logged, so the recording
+        # streams to the .rrd file from the start.
+        logger.info(f"Saving Rerun recording to {args.output_dir}/rerun_recording.rrd")
+        rr.save(f"{args.output_dir}/rerun_recording.rrd")
+
+    # Load depth maps
+    logger.info(f"Loading depth maps from {args.depth_file}")
+    depth_maps = load_depth_maps(
+        args.depth_file, invert=args.invert_depth, scaling=args.depth_scaling
+    )
+    logger.info(f"Loaded depth maps: {depth_maps.shape}")
+
+    if args.image_or_video.lower().endswith(VIDEO_EXTENSIONS):  # Video mode
+        visualize_video(args.image_or_video, depth_maps, focal_length=args.focal_length)
+    else:  # Image mode
+        visualize_image(args.image_or_video, depth_maps, focal_length=args.focal_length)
+
+    if args.interactive:
+        logger.info("Interactive visualization running. Press Ctrl+C to exit.")
+        # Keep the program running for interactive visualization
+        try:
+            while True:
+                time.sleep(1)
+        except KeyboardInterrupt:
+            logger.info("Exiting interactive visualization")
+    else:
+        logger.info("Visualization complete")
+
+    logger.info(f"All output saved to {args.output_dir}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/tools/visualize_rgbd.py b/src/tools/visualize_rgbd.py
new file mode 100644
index 00000000..312aa77b
--- /dev/null
+++ b/src/tools/visualize_rgbd.py
@@ -0,0 +1,89 @@
+import argparse
+
+import numpy as np
+import open3d as o3d
+from PIL import Image
+
+
+def parse_args():
+    """Parse the command-line arguments of the Open3D RGB-D viewer."""
+    parser = argparse.ArgumentParser(description="Visualize depth maps")
+    parser.add_argument(
+        "--input_depth", type=str, required=True, help="Path to the input depth map (npz file)"
+    )
+    parser.add_argument(
+        "--input_image", type=str, required=True, help="Path to the input image (png file)"
+    )
+    parser.add_argument(
+        "--focal_length", type=float, default=1500.0, help="Focal length of the camera"
+    )
+    return parser.parse_args()
+
+
+def visualize_depth_with_open3d(depth, image_rgb, focal_length=2000.0):
+    """Back-project an RGB-D pair into a colored point cloud and show it with Open3D.
+
+    Args:
+    ----
+        depth: Depth map (H, W); used as-is (``depth_scale=1.0``) and truncated at
+            ``depth_trunc=100.0``.
+        image_rgb: Color image (H, W, 3) with the same height and width as ``depth``.
+        focal_length: Focal length in pixels, used for both ``fx`` and ``fy``.
+
+    Raises:
+    ------
+        ValueError: If ``depth`` is not 2D or its size differs from ``image_rgb``.
+
+    """
+    if depth.ndim != 2 or tuple(image_rgb.shape[:2]) != tuple(depth.shape):
+        raise ValueError(
+            f"Depth map {depth.shape} and image {image_rgb.shape} must share height and width"
+        )
+    height, width = depth.shape
+
+    # Open3D images wrap C-contiguous buffers; cast to the uint8 color / float32 depth
+    # formats that RGBDImage.create_from_color_and_depth accepts.
+    color_image = o3d.geometry.Image(np.ascontiguousarray(image_rgb, dtype=np.uint8))
+    depth_image = o3d.geometry.Image(np.ascontiguousarray(depth, dtype=np.float32))
+
+    rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth(
+        color_image,
+        depth_image,
+        depth_scale=1.0,  # Adjust this value based on your depth scale
+        depth_trunc=100.0,  # Adjust this value based on your depth range
+        convert_rgb_to_intensity=False,
+    )
+
+    pcd = o3d.geometry.PointCloud.create_from_rgbd_image(
+        rgbd,
+        o3d.camera.PinholeCameraIntrinsic(
+            width=width,
+            height=height,
+            fx=focal_length,
+            fy=focal_length,
+            cx=width / 2,
+            cy=height / 2,
+        ),
+    )
+
+    # flip the orientation, so it looks upright, not upside-down
+    pcd.transform([[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]])
+
+    # Show the cloud together with a small coordinate frame at the origin. The cloud
+    # has no normals, so normal rendering is not requested.
+    coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1, origin=[0, 0, 0])
+    o3d.visualization.draw_geometries([pcd, coordinate_frame])
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    # load depth map
+    with np.load(args.input_depth) as data:
+        depth = data["depth"]
+
+    # load image
+    img_pil = Image.open(args.input_image).convert("RGB")
+    image = np.array(img_pil)
+
+    visualize_depth_with_open3d(depth, image, focal_length=args.focal_length)
diff --git a/v1.3_data b/v1.3_data
new file mode 120000
index 00000000..e5d5a2a6
--- /dev/null
+++ b/v1.3_data
@@ -0,0 +1 @@
+../WildPose_v1.3/data
\ No newline at end of file