From b958f33924e015949718c30663cd3fda7a5a6444 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20J=C3=BClg?= <tobias.juelg@utn.de>
Date: Tue, 3 Jun 2025 08:54:12 +0200
Subject: [PATCH 1/3] refactor: replace pydantic with dataclasses

---
 pyproject.toml                  | 1 -
 python/rcs/camera/interface.py  | 8 ++------
 python/rcs/hand/tilburg_hand.py | 5 +++--
 3 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b4107988..d3b5e7e3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,6 @@ dependencies = ["websockets>=11.0",
     "requests~=2.31",
     "numpy",
     "typer~=0.9",
-    "pydantic~=2.6",
     "gymnasium~=0.29.1",
     "pydantic_yaml~=1.3",
     "absl-py~=2.1",
diff --git a/python/rcs/camera/interface.py b/python/rcs/camera/interface.py
index e7ab9257..d667ac2b 100644
--- a/python/rcs/camera/interface.py
+++ b/python/rcs/camera/interface.py
@@ -4,7 +4,6 @@
 from time import sleep, time
 from typing import Any, Protocol
 
-from pydantic import BaseModel, Field
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -36,12 +35,9 @@ def __call__(self, frame_rate: int | float):
         self.t = time()
 
 
-class BaseCameraConfig(BaseModel):
+@dataclass(kw_only=True)
+class BaseCameraConfig:
     identifier: str
-
-
-class BaseCameraSetConfig(BaseModel):
-    cameras: dict = Field(default={})
     resolution_width: int = 1280  # pixels
     resolution_height: int = 720  # pixels
     frame_rate: int = 15  # Hz
diff --git a/python/rcs/hand/tilburg_hand.py b/python/rcs/hand/tilburg_hand.py
index af0aa6a6..14cef140 100644
--- a/python/rcs/hand/tilburg_hand.py
+++ b/python/rcs/hand/tilburg_hand.py
@@ -1,9 +1,9 @@
 import copy
+from dataclasses import dataclass
 import logging
 from time import sleep
 
 import numpy as np
-from pydantic import BaseModel
 from rcs.envs.space_utils import Vec18Type
 from rcs.hand.interface import BaseHand
 from tilburg_hand import Finger, TilburgHandMotorInterface, Unit
@@ -14,7 +14,8 @@
 logger.disabled = False
 
 
-class THConfig(BaseModel):
+@dataclass(kw_only=True)
+class THConfig:
     """Config for the Tilburg hand"""
 
     calibration_file: str | None = None

From 9bd3618fe2954ee59f9546b5da307f0fa10ad587 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20J=C3=BClg?= <tobias.juelg@utn.de>
Date: Tue, 3 Jun 2025 14:45:01 +0200
Subject: [PATCH 2/3] refactor(camera): config and camera set

put base camera config into cpp code
---
 python/rcs/_core/common.pyi |  8 ++++++++
 python/rcs/_core/sim.pyi    | 22 ++--------------------
 src/common/Camera.h         | 24 ++++++++++++++++++++++++
 src/pybind/rcs.cpp          | 35 +++++++++++++++++------------------
 src/sim/camera.cpp          | 24 ++++++++++++++----------
 src/sim/camera.h            | 19 ++++++++-----------
 6 files changed, 73 insertions(+), 59 deletions(-)
 create mode 100644 src/common/Camera.h

diff --git a/python/rcs/_core/common.pyi b/python/rcs/_core/common.pyi
index ef6ded08..42ec9d5b 100644
--- a/python/rcs/_core/common.pyi
+++ b/python/rcs/_core/common.pyi
@@ -10,6 +10,7 @@ import numpy
 import pybind11_stubgen.typing_ext
 
 __all__ = [
+    "BaseCameraConfig",
     "FR3",
     "FrankaHandTCPOffset",
     "Gripper",
@@ -35,6 +36,13 @@ __all__ = [
 M = typing.TypeVar("M", bound=int)
 N = typing.TypeVar("N", bound=int)
 
+class BaseCameraConfig:
+    frame_rate: int
+    identifier: str
+    resolution_height: int
+    resolution_width: int
+    def __init__(self, identifier: str, frame_rate: int, resolution_width: int, resolution_height: int) -> None: ...
+
 class Gripper:
     def get_normalized_width(self) -> float: ...
     def get_parameters(self) -> GripperConfig: ...
diff --git a/python/rcs/_core/sim.pyi b/python/rcs/_core/sim.pyi
index ff64f481..dce9fb89 100644
--- a/python/rcs/_core/sim.pyi
+++ b/python/rcs/_core/sim.pyi
@@ -15,7 +15,6 @@ __all__ = [
     "Sim",
     "SimCameraConfig",
     "SimCameraSet",
-    "SimCameraSetConfig",
     "SimGripper",
     "SimGripperConfig",
     "SimGripperState",
@@ -83,13 +82,11 @@ class Sim:
     def step(self, k: int) -> None: ...
     def step_until_convergence(self) -> None: ...
 
-class SimCameraConfig:
-    identifier: str
+class SimCameraConfig(rcs._core.common.BaseCameraConfig):
     type: CameraType
-    def __init__(self) -> None: ...
 
 class SimCameraSet:
-    def __init__(self, sim: Sim, cfg: SimCameraSetConfig) -> None: ...
+    def __init__(self, sim: Sim, cameras: dict[str, SimCameraConfig], render_on_demand: bool = True) -> None: ...
     def buffer_size(self) -> int: ...
     def clear_buffer(self) -> None: ...
     def get_latest_frameset(self) -> FrameSet | None: ...
@@ -97,21 +94,6 @@ class SimCameraSet:
     @property
     def _sim(self) -> Sim: ...
 
-class SimCameraSetConfig:
-    cameras: dict[str, SimCameraConfig]
-    max_buffer_frames: int
-    resolution_height: int
-    resolution_width: int
-    def __init__(self) -> None: ...
-    @property
-    def frame_rate(self) -> int:
-        """
-        The frame rate in which the cameras render in Hz. If set to zero, the camera frames will render on demand and without fixed rate which takes away compute effort.
-        """
-
-    @frame_rate.setter
-    def frame_rate(self, arg0: int) -> None: ...
-
 class SimGripper(rcs._core.common.Gripper):
     def __init__(self, sim: Sim, cfg: SimGripperConfig) -> None: ...
     def get_parameters(self) -> SimGripperConfig: ...
diff --git a/src/common/Camera.h b/src/common/Camera.h
new file mode 100644
index 00000000..3077da19
--- /dev/null
+++ b/src/common/Camera.h
@@ -0,0 +1,24 @@
+#ifndef RCS_CAMERA_H
+#define RCS_CAMERA_H
+#include <string>
+
+namespace rcs {
+namespace common {
+
+struct BaseCameraConfig {
+  std::string identifier;
+  int frame_rate;
+  int resolution_width;
+  int resolution_height;
+  BaseCameraConfig(const std::string &identifier, int frame_rate,
+                   int resolution_width, int resolution_height = 480)
+      : identifier(identifier),
+        frame_rate(frame_rate),
+        resolution_width(resolution_width),
+        resolution_height(resolution_height) {}
+};
+
+}  // namespace common
+}  // namespace rcs
+
+#endif  // RCS_CAMERA_H
\ No newline at end of file
diff --git a/src/pybind/rcs.cpp b/src/pybind/rcs.cpp
index 0514393c..87fc9966 100644
--- a/src/pybind/rcs.cpp
+++ b/src/pybind/rcs.cpp
@@ -145,6 +145,17 @@ PYBIND11_MODULE(_core, m) {
   common.def("IdentityRotQuatVec", &rcs::common::IdentityRotQuatVec);
   common.def("FrankaHandTCPOffset", &rcs::common::FrankaHandTCPOffset);
 
+  py::class_<rcs::common::BaseCameraConfig>(common, "BaseCameraConfig")
+      .def(py::init<const std::string &, int, int, int>(),
+           py::arg("identifier"), py::arg("frame_rate"),
+           py::arg("resolution_width"), py::arg("resolution_height"))
+      .def_readwrite("identifier", &rcs::common::BaseCameraConfig::identifier)
+      .def_readwrite("frame_rate", &rcs::common::BaseCameraConfig::frame_rate)
+      .def_readwrite("resolution_width",
+                     &rcs::common::BaseCameraConfig::resolution_width)
+      .def_readwrite("resolution_height",
+                     &rcs::common::BaseCameraConfig::resolution_height);
+
   py::class_<rcs::common::RPY>(common, "RPY")
       .def(py::init<double, double, double>(), py::arg("roll") = 0.0,
            py::arg("pitch") = 0.0, py::arg("yaw") = 0.0)
@@ -505,23 +516,9 @@ PYBIND11_MODULE(_core, m) {
       .value("fixed", rcs::sim::CameraType::fixed)
       .value("default_free", rcs::sim::CameraType::default_free)
       .export_values();
-  py::class_<rcs::sim::SimCameraConfig>(sim, "SimCameraConfig")
-      .def(py::init<>())
-      .def_readwrite("identifier", &rcs::sim::SimCameraConfig::identifier)
+  py::class_<rcs::sim::SimCameraConfig, rcs::common::BaseCameraConfig>(
+      sim, "SimCameraConfig")
       .def_readwrite("type", &rcs::sim::SimCameraConfig::type);
-  py::class_<rcs::sim::SimCameraSetConfig>(sim, "SimCameraSetConfig")
-      .def(py::init<>())
-      .def_readwrite("cameras", &rcs::sim::SimCameraSetConfig::cameras)
-      .def_readwrite("frame_rate", &rcs::sim::SimCameraSetConfig::frame_rate,
-                     "The frame rate in which the cameras render in Hz. If set "
-                     "to zero, the camera frames will render on demand and "
-                     "without fixed rate which takes away compute effort.")
-      .def_readwrite("resolution_width",
-                     &rcs::sim::SimCameraSetConfig::resolution_width)
-      .def_readwrite("resolution_height",
-                     &rcs::sim::SimCameraSetConfig::resolution_height)
-      .def_readwrite("max_buffer_frames",
-                     &rcs::sim::SimCameraSetConfig::max_buffer_frames);
   py::class_<rcs::sim::FrameSet>(sim, "FrameSet")
       .def(py::init<>())
       .def_readonly("color_frames", &rcs::sim::FrameSet::color_frames)
@@ -529,8 +526,10 @@ PYBIND11_MODULE(_core, m) {
       .def_readonly("timestamp", &rcs::sim::FrameSet::timestamp);
   py::class_<rcs::sim::SimCameraSet>(sim, "SimCameraSet")
       .def(py::init<std::shared_ptr<rcs::sim::Sim>,
-                    rcs::sim::SimCameraSetConfig>(),
-           py::arg("sim"), py::arg("cfg"))
+                    std::unordered_map<std::string, rcs::sim::SimCameraConfig>,
+                    bool>(),
+           py::arg("sim"), py::arg("cameras"),
+           py::arg("render_on_demand") = true)
       .def("buffer_size", &rcs::sim::SimCameraSet::buffer_size)
       .def("clear_buffer", &rcs::sim::SimCameraSet::clear_buffer)
       .def("get_latest_frameset", &rcs::sim::SimCameraSet::get_latest_frameset)
diff --git a/src/sim/camera.cpp b/src/sim/camera.cpp
index 6f70233f..d7de4ed3 100644
--- a/src/sim/camera.cpp
+++ b/src/sim/camera.cpp
@@ -21,17 +21,21 @@
 namespace rcs {
 namespace sim {
 
-SimCameraSet::SimCameraSet(std::shared_ptr<Sim> sim, SimCameraSetConfig cfg)
-    : sim{sim}, cfg{cfg}, buffer{}, buffer_lock{}, cameras{} {
-  for (auto const& [id, cam] : cfg.cameras) {
-    // if frame_rate is zero, then we only render when an image is requested
-    // this mode is useful when no videos are required as it speeds up the
-    // simulation significantly
+SimCameraSet::SimCameraSet(
+    std::shared_ptr<Sim> sim,
+    std::unordered_map<std::string, SimCameraConfig> cameras_cfg,
+    bool render_on_demand)
+    : sim{sim},
+      cameras_cfg{cameras_cfg},
+      buffer{},
+      buffer_lock{},
+      cameras{},
+      render_on_demand{render_on_demand} {
+  for (auto const& [id, cam] : cameras_cfg) {
     this->sim->register_rendering_callback(
         [this](const std::string& id, mjrContext& ctx, mjvScene& scene,
                mjvOption& opt) { this->frame_callback(id, ctx, scene, opt); },
-        id, this->cfg.frame_rate, this->cfg.resolution_width,
-        this->cfg.resolution_height);
+        id, cam.frame_rate, cam.resolution_width, cam.resolution_height);
 
     mjvCamera mjcam;
     mjv_defaultCamera(&mjcam);
@@ -60,7 +64,7 @@ void SimCameraSet::clear_buffer() {
 }
 
 std::optional<FrameSet> SimCameraSet::get_latest_frameset() {
-  if (this->cfg.frame_rate == 0) {
+  if (this->render_on_demand) {
     this->render_all();
   }
   if (buffer.empty()) {
@@ -80,7 +84,7 @@ std::optional<FrameSet> SimCameraSet::get_timestamp_frameset(float ts) {
 }
 
 void SimCameraSet::render_all() {
-  for (auto const& [id, cam] : this->cfg.cameras) {
+  for (auto const& [id, cam] : this->cameras_cfg) {
     mjrContext* ctx = this->sim->renderer.get_context(id);
     this->frame_callback(id, *ctx, this->sim->renderer.scene,
                          this->sim->renderer.opt);
diff --git a/src/sim/camera.h b/src/sim/camera.h
index a9960d69..1fb7eef2 100644
--- a/src/sim/camera.h
+++ b/src/sim/camera.h
@@ -10,6 +10,7 @@
 #include <set>
 #include <unordered_map>
 
+#include "common/Camera.h"
 #include "sim/sim.h"
 
 namespace rcs {
@@ -21,17 +22,10 @@ enum CameraType {
   fixed = mjCAMERA_FIXED,
   default_free
 };
-struct SimCameraConfig {
-  std::string identifier;
+
+struct SimCameraConfig : common::BaseCameraConfig {
   CameraType type;
 };
-struct SimCameraSetConfig {
-  std::unordered_map<std::string, SimCameraConfig> cameras;
-  int frame_rate;
-  int resolution_width;
-  int resolution_height;
-  int max_buffer_frames;
-};
 
 // (H,W,3)
 typedef Eigen::Matrix<unsigned char, Eigen::Dynamic, 1> ColorFrame;
@@ -45,7 +39,9 @@ struct FrameSet {
 
 class SimCameraSet {
  public:
-  SimCameraSet(std::shared_ptr<rcs::sim::Sim> sim, SimCameraSetConfig cfg);
+  SimCameraSet(std::shared_ptr<rcs::sim::Sim> sim,
+               std::unordered_map<std::string, SimCameraConfig> cameras,
+               bool render_on_demand = true);
   ~SimCameraSet();
 
   int buffer_size();
@@ -58,10 +54,11 @@ class SimCameraSet {
                       mjvOption& opt);
 
   std::shared_ptr<Sim> get_sim() { return sim; }
+  bool render_on_demand;
 
  private:
   std::shared_ptr<Sim> sim;
-  const SimCameraSetConfig cfg;
+  std::unordered_map<std::string, SimCameraConfig> cameras_cfg;
   std::vector<FrameSet> buffer;
   std::unordered_map<std::string, mjvCamera> cameras;
   std::mutex buffer_lock;

From e7f839ebf08fdd47028377ee0ddeed30d754e38d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20J=C3=BClg?= <tobias.juelg@utn.de>
Date: Wed, 4 Jun 2025 20:59:53 +0200
Subject: [PATCH 3/3] refactor(camera): hardware camera with composition and
 digit as camera

- each camera can be configured individually
- HardwareCameraSet uses HardwareCamera interface with composition instead of inheritance
- BaseCameraConfig moved to cpp
- removed digit camera set wrapper as camera set wrapper is used now
---
 pyproject.toml                                |   1 -
 python/examples/env_cartesian_control.py      |   2 +-
 .../examples/env_cartesian_control_digit.py   |   6 +-
 .../examples/env_cartesian_control_tilburg.py |   2 +-
 python/examples/env_joint_control.py          |   2 +-
 python/examples/fr3.py                        |  21 ++-
 python/rcs/_core/sim.pyi                      |   3 +
 python/rcs/camera/digit_cam.py                |  47 +++---
 python/rcs/camera/hw.py                       | 157 +++++++++++-------
 python/rcs/camera/interface.py                |  33 ++--
 python/rcs/camera/kinect.py                   |  51 +++---
 python/rcs/camera/realsense.py                | 127 +++++++-------
 python/rcs/camera/sim.py                      |  84 +++-------
 python/rcs/envs/base.py                       |  59 +------
 python/rcs/envs/creators.py                   | 134 +++++++++------
 python/rcs/envs/utils.py                      |  63 ++++---
 python/rcs/hand/tilburg_hand.py               |   2 +-
 python/tests/test_sim_envs.py                 |  42 +++--
 src/common/Camera.h                           |   3 +-
 src/pybind/rcs.cpp                            |   4 +
 src/sim/camera.h                              |   6 +
 21 files changed, 427 insertions(+), 422 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index d3b5e7e3..8f28490d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,6 @@ dependencies = ["websockets>=11.0",
     "numpy",
     "typer~=0.9",
     "gymnasium~=0.29.1",
-    "pydantic_yaml~=1.3",
     "absl-py~=2.1",
     "etils[epath]>=1.7.0",
     "glfw~=2.7",
diff --git a/python/examples/env_cartesian_control.py b/python/examples/env_cartesian_control.py
index a544d7a9..f00b8d3d 100644
--- a/python/examples/env_cartesian_control.py
+++ b/python/examples/env_cartesian_control.py
@@ -67,7 +67,7 @@ def main():
                 robot_cfg=default_fr3_sim_robot_cfg(),
                 collision_guard=False,
                 gripper_cfg=default_fr3_sim_gripper_cfg(),
-                camera_set_cfg=default_mujoco_cameraset_cfg(),
+                cameras=default_mujoco_cameraset_cfg(),
                 max_relative_movement=0.5,
                 relative_to=RelativeTo.LAST_STEP,
             )
diff --git a/python/examples/env_cartesian_control_digit.py b/python/examples/env_cartesian_control_digit.py
index c9e5e3a3..a86e0fa9 100644
--- a/python/examples/env_cartesian_control_digit.py
+++ b/python/examples/env_cartesian_control_digit.py
@@ -13,6 +13,8 @@
     default_mujoco_cameraset_cfg,
 )
 
+from python.rcs.camera.hw import HardwareCameraSet
+
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
@@ -57,8 +59,8 @@ def main():
                 control_mode=ControlMode.CARTESIAN_TQuat,
                 robot_cfg=default_fr3_hw_robot_cfg(),
                 collision_guard="lab",
+                camera_set=HardwareCameraSet(default_digit({"digit_0": "D21182"})),
                 gripper_cfg=default_fr3_hw_gripper_cfg(),
-                digit_set=default_digit({"digit_0": "D21182"}),
                 max_relative_movement=0.5,
                 relative_to=RelativeTo.LAST_STEP,
             )
@@ -68,7 +70,7 @@ def main():
                 robot_cfg=default_fr3_sim_robot_cfg(),
                 collision_guard=False,
                 gripper_cfg=default_fr3_sim_gripper_cfg(),
-                camera_set_cfg=default_mujoco_cameraset_cfg(),
+                cameras=default_mujoco_cameraset_cfg(),
                 max_relative_movement=0.5,
                 relative_to=RelativeTo.LAST_STEP,
             )
diff --git a/python/examples/env_cartesian_control_tilburg.py b/python/examples/env_cartesian_control_tilburg.py
index 052920a7..958ee5b8 100644
--- a/python/examples/env_cartesian_control_tilburg.py
+++ b/python/examples/env_cartesian_control_tilburg.py
@@ -66,7 +66,7 @@ def main():
                 robot_cfg=default_fr3_sim_robot_cfg(),
                 collision_guard=False,
                 gripper_cfg=default_fr3_sim_gripper_cfg(),
-                camera_set_cfg=default_mujoco_cameraset_cfg(),
+                cameras=default_mujoco_cameraset_cfg(),
                 max_relative_movement=0.5,
                 relative_to=RelativeTo.LAST_STEP,
             )
diff --git a/python/examples/env_joint_control.py b/python/examples/env_joint_control.py
index dbcd1df1..941ae338 100644
--- a/python/examples/env_joint_control.py
+++ b/python/examples/env_joint_control.py
@@ -68,7 +68,7 @@ def main():
                 collision_guard=False,
                 robot_cfg=default_fr3_sim_robot_cfg(),
                 gripper_cfg=default_fr3_sim_gripper_cfg(),
-                camera_set_cfg=default_mujoco_cameraset_cfg(),
+                cameras=default_mujoco_cameraset_cfg(),
                 max_relative_movement=np.deg2rad(5),
                 relative_to=RelativeTo.LAST_STEP,
             )
diff --git a/python/examples/fr3.py b/python/examples/fr3.py
index 2d9b8eee..e101d333 100644
--- a/python/examples/fr3.py
+++ b/python/examples/fr3.py
@@ -7,7 +7,7 @@
 from rcs._core.common import RobotPlatform
 from rcs._core.hw import FR3Config, IKSolver
 from rcs._core.sim import CameraType
-from rcs.camera.sim import SimCameraConfig, SimCameraSet, SimCameraSetConfig
+from rcs.camera.sim import SimCameraConfig, SimCameraSet
 from rcs.control.fr3_desk import FCI, ContextManager, Desk, load_creds_fr3_desk
 from rcs.envs.creators import get_urdf_path
 
@@ -80,11 +80,22 @@ def main():
 
             # add camera to have a rendering gui
             cameras = {
-                "default_free": SimCameraConfig(identifier="", type=int(CameraType.default_free)),
-                "wrist": SimCameraConfig(identifier="wrist_0", type=int(CameraType.fixed)),
+                "default_free": SimCameraConfig(
+                    identifier="",
+                    type=CameraType.default_free,
+                    resolution_width=1280,
+                    resolution_height=720,
+                    frame_rate=20,
+                ),
+                "wrist": SimCameraConfig(
+                    identifier="wrist_0",
+                    type=CameraType.fixed,
+                    resolution_width=640,
+                    resolution_height=480,
+                    frame_rate=30,
+                ),
             }
-            cam_cfg = SimCameraSetConfig(cameras=cameras, resolution_width=1280, resolution_height=720, frame_rate=20)
-            camera_set = SimCameraSet(simulation, cam_cfg)  # noqa: F841
+            camera_set = SimCameraSet(simulation, cameras)  # noqa: F841
             simulation.open_gui()
 
         else:
diff --git a/python/rcs/_core/sim.pyi b/python/rcs/_core/sim.pyi
index dce9fb89..e4c6e04e 100644
--- a/python/rcs/_core/sim.pyi
+++ b/python/rcs/_core/sim.pyi
@@ -84,6 +84,9 @@ class Sim:
 
 class SimCameraConfig(rcs._core.common.BaseCameraConfig):
     type: CameraType
+    def __init__(
+        self, identifier: str, frame_rate: int, resolution_width: int, resolution_height: int, type: CameraType = ...
+    ) -> None: ...
 
 class SimCameraSet:
     def __init__(self, sim: Sim, cameras: dict[str, SimCameraConfig], render_on_demand: bool = True) -> None: ...
diff --git a/python/rcs/camera/digit_cam.py b/python/rcs/camera/digit_cam.py
index 38bcc0ef..3a1be493 100644
--- a/python/rcs/camera/digit_cam.py
+++ b/python/rcs/camera/digit_cam.py
@@ -1,40 +1,36 @@
 from digit_interface.digit import Digit
-from rcs.camera.hw import BaseHardwareCameraSet, HWCameraSetConfig
+from rcs._core.common import BaseCameraConfig
+from rcs.camera.hw import HardwareCamera
 from rcs.camera.interface import CameraFrame, DataFrame, Frame
 
 
-class DigitConfig(HWCameraSetConfig):
-    """
-    Configuration for the DIGIT device.
-    This class is used to define the settings for the DIGIT device.
-    """
-
-    stream_name: str = "QVGA"  # options: "QVGA" (60 and 30 fps), "VGA" (30 and 15 fps)
-
-
-class DigitCam(BaseHardwareCameraSet):
+class DigitCam(HardwareCamera):
     """
     This module provides an interface to interact with the DIGIT device.
     It allows for connecting to the device, changing settings, and retrieving information.
     """
 
-    def __init__(self, cfg: DigitConfig):
-        self._cfg = cfg
-        super().__init__()
+    def __init__(self, cameras: dict[str, BaseCameraConfig]):
+        self.cameras = cameras
+        self._camera_names = list(self.cameras.keys())
         self._cameras: dict[str, Digit] = {}
-        self.initalize(self.config)
 
-    def initalize(self, cfg: HWCameraSetConfig):
+    def open(self):
         """
         Initialize the digit interface with the given configuration.
         :param cfg: Configuration for the DIGIT device.
         """
-        for name, serial in cfg.name_to_identifier.items():
-            digit = Digit(serial, name)
+        for name, camera in self.cameras.items():
+            digit = Digit(camera.identifier, name)
             digit.connect()
             self._cameras[name] = digit
 
-    def _poll_frame(self, camera_name: str) -> Frame:
+    @property
+    def camera_names(self) -> list[str]:
+        """Returns the names of the cameras in this set."""
+        return self._camera_names
+
+    def poll_frame(self, camera_name: str) -> Frame:
         """Polls the frame from the camera with the given name."""
         digit = self._cameras[camera_name]
         frame = digit.get_frame()
@@ -45,6 +41,13 @@ def _poll_frame(self, camera_name: str) -> Frame:
 
         return Frame(camera=cf)
 
-    @property
-    def config(self) -> DigitConfig:
-        return self._cfg
+    def close(self):
+        """
+        Closes the connection to the DIGIT device.
+        """
+        for digit in self._cameras.values():
+            digit.disconnect()
+        self._cameras = {}
+
+    def config(self, camera_name) -> BaseCameraConfig:
+        return self.cameras[camera_name]
diff --git a/python/rcs/camera/hw.py b/python/rcs/camera/hw.py
index 6aee0aa0..dc3ae1f5 100644
--- a/python/rcs/camera/hw.py
+++ b/python/rcs/camera/hw.py
@@ -1,39 +1,57 @@
 import logging
 import threading
 import typing
-from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from datetime import datetime
 from pathlib import Path
 from time import sleep
 
 import cv2
 import numpy as np
-from pydantic import Field
-from rcs.camera.interface import (
-    BaseCameraConfig,
-    BaseCameraSetConfig,
-    Frame,
-    FrameSet,
-    SimpleFrameRate,
-)
-
-
-class HWCameraSetConfig(BaseCameraSetConfig):
-    cameras: dict[str, BaseCameraConfig] = Field(default={})
-    warm_up_disposal_frames: int = 30  # frames
-    record_path: str = "camera_frames"
-    max_buffer_frames: int = 1000
-
-
-# TODO(juelg): refactor camera thread into their own class, to avoid a base hardware camera set class
-# TODO(juelg): add video recording
-class BaseHardwareCameraSet(ABC):
-    """This base class should have the ability to poll in a separate thread for all cameras and store them in a buffer.
-    Implements BaseCameraSet
+from rcs._core.common import BaseCameraConfig
+from rcs.camera.interface import BaseCameraSet, Frame, FrameSet, SimpleFrameRate
+
+
+class HardwareCamera(typing.Protocol):
+    """Implementation of a hardware camera potentially a set of cameras of the same kind."""
+
+    def open(self):
+        """Should open the camera and prepare it for polling."""
+
+    def close(self):
+        """Should close the camera and release all resources."""
+
+    def config(self, camera_name: str) -> BaseCameraConfig:
+        """Should return the configuration object of the cameras."""
+
+    def poll_frame(self, camera_name: str) -> Frame:
+        """Should return the latest frame from the camera with the given name.
+
+        This method should be thread safe.
+        """
+
+    @property
+    def camera_names(self) -> list[str]:
+        """Returns the names of the cameras in this set."""
+
+
+class HardwareCameraSet(BaseCameraSet):
+    """This base class polls in a separate thread for all cameras and stores them in a buffer.
+
+    Cameras can consist of multiple cameras, e.g. RealSense cameras.
     """
 
-    def __init__(self):
-        self._buffer: list[FrameSet | None] = [None for _ in range(self.config.max_buffer_frames)]
+    def __init__(
+        self, cameras: Sequence[HardwareCamera], warm_up_disposal_frames: int = 30, max_buffer_frames: int = 1000
+    ):
+        self.cameras = cameras
+        self.camera_dict, self._camera_names = self._cameras_util()
+        self.frame_rate = self._frames_rate()
+        self.rate_limiter = SimpleFrameRate(self.frame_rate)
+
+        self.warm_up_disposal_frames = warm_up_disposal_frames
+        self.max_buffer_frames = max_buffer_frames
+        self._buffer: list[FrameSet | None] = [None for _ in range(self.max_buffer_frames)]
         self._buffer_lock = threading.Lock()
         self.running = False
         self._thread: threading.Thread | None = None
@@ -41,7 +59,42 @@ def __init__(self):
         self._next_ring_index = 0
         self._buffer_len = 0
         self.writer: dict[str, cv2.VideoWriter] = {}
-        self.rate = SimpleFrameRate()
+
+    @property
+    def camera_names(self) -> list[str]:
+        """Returns the names of the cameras in this set."""
+        return self._camera_names
+
+    @property
+    def name_to_identifier(self) -> dict[str, str]:
+        """Returns a dictionary mapping the camera names to their identifiers."""
+        name_to_id: dict[str, str] = {}
+        for camera in self.cameras:
+            for name in camera.camera_names:
+                name_to_id[name] = camera.config(name).identifier
+        return name_to_id
+
+    def _frames_rate(self) -> int:
+        """Checks if all cameras have the same frame rate."""
+        frame_rates = {camera.config(name).frame_rate for camera in self.cameras for name in camera.camera_names}
+        if len(frame_rates) > 1:
+            msg = "All cameras must have the same frame rate. Different frame rates are not supported."
+            raise ValueError(msg)
+        if len(frame_rates) == 0:
+            self._logger.warning("No camera found, empty polling with 1 fps.")
+            return 1
+        return next(iter(frame_rates))
+
+    def _cameras_util(self) -> tuple[dict[str, HardwareCamera], list[str]]:
+        """Utility function to create a dictionary of cameras and a list of camera names."""
+        camera_dict: dict[str, HardwareCamera] = {}
+        camera_names: list[str] = []
+        for camera in self.cameras:
+            camera_names.extend(camera.camera_names)
+            for name in camera.camera_names:
+                assert name not in camera_dict, f"Camera name {name} not unique."
+                camera_dict[name] = camera
+        return camera_dict, camera_names
 
     def buffer_size(self) -> int:
         return len(self._buffer) - self._buffer.count(None)
@@ -64,7 +117,7 @@ def get_timestamp_frames(self, ts: datetime) -> FrameSet | None:
         # iterate through the buffer and find the closest timestamp
         with self._buffer_lock:
             for i in range(self._buffer_len):
-                idx = (self._next_ring_index - i - 1) % self.config.max_buffer_frames  # iterate backwards
+                idx = (self._next_ring_index - i - 1) % self.max_buffer_frames  # iterate backwards
                 assert self._buffer[idx] is not None
                 item: FrameSet = typing.cast(FrameSet, self._buffer[idx])
                 assert item.avg_timestamp is not None
@@ -82,6 +135,8 @@ def stop(self):
     def close(self):
         if self.running and self._thread is not None:
             self.stop()
+        for camera in self.cameras:
+            camera.close()
         self.stop_video()
 
     def start(self, warm_up: bool = True):
@@ -101,8 +156,8 @@ def record_video(self, path: Path, str_id: str):
                 str(path / f"episode_{str_id}_{camera}.mp4"),
                 # migh require to install ffmpeg
                 cv2.VideoWriter_fourcc(*"mp4v"),  # type: ignore
-                self.config.frame_rate,
-                (self.config.resolution_width, self.config.resolution_height),
+                self.frame_rate,
+                (self.config(camera).resolution_width, self.config(camera).resolution_height),
             )
 
     def recording_ongoing(self) -> bool:
@@ -117,12 +172,14 @@ def stop_video(self):
                 self.writer = {}
 
     def warm_up(self):
-        for _ in range(self.config.warm_up_disposal_frames):
+        for _ in range(self.warm_up_disposal_frames):
             for camera_name in self.camera_names:
-                self._poll_frame(camera_name)
-            self.rate(self.config.frame_rate)
+                self.poll_frame(camera_name)
+            self.rate_limiter()
 
     def polling_thread(self, warm_up: bool = True):
+        for camera in self.cameras:
+            camera.open()
         if warm_up:
             self.warm_up()
         while self.running:
@@ -130,19 +187,20 @@ def polling_thread(self, warm_up: bool = True):
             # buffering
             with self._buffer_lock:
                 self._buffer[self._next_ring_index] = frame_set
-                self._next_ring_index = (self._next_ring_index + 1) % self.config.max_buffer_frames
-                self._buffer_len = max(self._buffer_len + 1, self.config.max_buffer_frames)
+                self._next_ring_index = (self._next_ring_index + 1) % self.max_buffer_frames
+                self._buffer_len = max(self._buffer_len + 1, self.max_buffer_frames)
             # video recording
             for camera_key, writer in self.writer.items():
                 if frame_set is not None:
                     writer.write(frame_set.frames[camera_key].camera.color.data[:, :, ::-1])
-            self.rate(self.config.frame_rate)
+            self.rate_limiter()
 
     def poll_frame_set(self) -> FrameSet:
         """Gather frames over all available cameras."""
         frames: dict[str, Frame] = {}
         for camera_name in self.camera_names:
-            frame = self._poll_frame(camera_name)
+            # callback
+            frame = self.poll_frame(camera_name)
             frames[camera_name] = frame
         # filter none
         timestamps: list[float] = [frame.avg_timestamp for frame in frames.values() if frame.avg_timestamp is not None]
@@ -151,29 +209,14 @@ def poll_frame_set(self) -> FrameSet:
     def clear_buffer(self):
         """Deletes all frames from the buffer."""
         with self._buffer_lock:
-            self._buffer = [None for _ in range(self.config.max_buffer_frames)]
+            self._buffer = [None for _ in range(self.max_buffer_frames)]
             self._next_ring_index = 0
             self._buffer_len = 0
         self.wait_for_frames()
 
-    @property
-    @abstractmethod
-    def config(self) -> HWCameraSetConfig:
-        """Should return the configuration object of the cameras."""
-
-    @abstractmethod
-    def _poll_frame(self, camera_name: str) -> Frame:
-        """Should return the latest frame from the camera with the given name.
+    def config(self, camera_name: str) -> BaseCameraConfig:
+        """Returns the configuration object of the cameras."""
+        return self.camera_dict[camera_name].config(camera_name)
 
-        This method should be thread safe.
-        """
-
-    @property
-    def camera_names(self) -> list[str]:
-        """Should return a list of the activated human readable names of the cameras."""
-        return list(self.config.cameras)
-
-    @property
-    def name_to_identifier(self) -> dict[str, str]:
-        # return {key: camera.identifier for key, camera in self._cfg.cameras.items()}
-        return self.config.name_to_identifier
+    def poll_frame(self, camera_name: str) -> Frame:
+        return self.camera_dict[camera_name].poll_frame(camera_name)
diff --git a/python/rcs/camera/interface.py b/python/rcs/camera/interface.py
index d667ac2b..23a595eb 100644
--- a/python/rcs/camera/interface.py
+++ b/python/rcs/camera/interface.py
@@ -4,27 +4,31 @@
 from time import sleep, time
 from typing import Any, Protocol
 
+from rcs._core.common import BaseCameraConfig
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
 
 class SimpleFrameRate:
-    def __init__(self):
-        self.t = None
-        self._last_print = None
+    def __init__(self, frame_rate: int | float):
+        self.t: float | None = None
+        self._last_print: float | None = None
+        self.frame_rate = frame_rate
 
     def reset(self):
         self.t = None
 
-    def __call__(self, frame_rate: int | float):
+    def __call__(self):
         if self.t is None:
             self.t = time()
             self._last_print = self.t
-            sleep(1 / frame_rate if isinstance(frame_rate, int) else frame_rate)
+            sleep(1 / self.frame_rate if isinstance(self.frame_rate, int) else self.frame_rate)
             return
         sleep_time = (
-            1 / frame_rate - (time() - self.t) if isinstance(frame_rate, int) else frame_rate - (time() - self.t)
+            1 / self.frame_rate - (time() - self.t)
+            if isinstance(self.frame_rate, int)
+            else self.frame_rate - (time() - self.t)
         )
         if sleep_time > 0:
             sleep(sleep_time)
@@ -35,18 +39,6 @@ def __call__(self, frame_rate: int | float):
         self.t = time()
 
 
-@dataclass(kw_only=True)
-class BaseCameraConfig:
-    identifier: str
-    resolution_width: int = 1280  # pixels
-    resolution_height: int = 720  # pixels
-    frame_rate: int = 15  # Hz
-
-    @property
-    def name_to_identifier(self):
-        return {key: camera.identifier for key, camera in self.cameras.items()}
-
-
 @dataclass(kw_only=True)
 class DataFrame:
     data: Any
@@ -100,9 +92,8 @@ def clear_buffer(self):
     def close(self):
         """Stops any running threads e.g. for exitting."""
 
-    @property
-    def config(self) -> BaseCameraSetConfig:
-        """Return the configuration object of the cameras."""
+    def config(self, camera_name: str) -> BaseCameraConfig:
+        """Returns the configuration object of the cameras."""
 
     @property
     def camera_names(self) -> list[str]:
diff --git a/python/rcs/camera/kinect.py b/python/rcs/camera/kinect.py
index bfe9c095..ad213be4 100644
--- a/python/rcs/camera/kinect.py
+++ b/python/rcs/camera/kinect.py
@@ -1,18 +1,25 @@
 import k4a
 import numpy as np
-from rcs.camera.hw import BaseHardwareCameraSet, HWCameraSetConfig
+from rcs._core.common import BaseCameraConfig
+from rcs.camera.hw import HardwareCamera
 from rcs.camera.interface import CameraFrame, DataFrame, Frame, IMUFrame
 
 
-class KinectConfig(HWCameraSetConfig):
-    include_imu: bool = False
-    timeout_ms: int = 2000
+class KinectCamera(HardwareCamera):
+    # ATTENTION: this code is untested
+    def __init__(self, cameras: dict[str, BaseCameraConfig], include_imu: bool = False, timeout_ms: int = 2000) -> None:
+        self.include_imu = include_imu
+        self.timeout_ms = timeout_ms
+        self.cameras = cameras
+        assert len(self.cameras) == 1, "Kinect only supports one camera."
+        self._camera_names = list(self.cameras.keys())
 
+    @property
+    def camera_names(self) -> list[str]:
+        """Returns the names of the cameras in this set."""
+        return self._camera_names
 
-class KinectCamera(BaseHardwareCameraSet):
-    def __init__(self, cfg: KinectConfig) -> None:
-        super().__init__()
-        self._cfg = cfg
+    def open(self):
         self._device = k4a.Device.open()
         device_config = k4a.DEVICE_CONFIG_BGRA32_1080P_NFOV_2X2BINNED_FPS15
         if self._device is None:
@@ -21,21 +28,13 @@ def __init__(self, cfg: KinectConfig) -> None:
         if self._device.start_cameras(device_config) != k4a.EStatus.SUCCEEDED:
             msg = "Failed to start cameras."
             raise OSError(msg)
-        if self._cfg.include_imu and self._device.start_imu() != k4a.EWaitStatus.SUCCEEDED:
+        if self.include_imu and self._device.start_imu() != k4a.EWaitStatus.SUCCEEDED:
             msg = "Failed to start IMU."
             raise OSError(msg)
 
-    @property
-    def config(self) -> KinectConfig:
-        return self._cfg
-
-    @config.setter
-    def config(self, cfg: KinectConfig) -> None:
-        self._cfg = cfg
-
-    def _poll_frame(self, camera_name: str = "") -> Frame:
+    def poll_frame(self, camera_name: str = "") -> Frame:
         assert camera_name == "kinect", "Kinect code only supports one camera."
-        capture = self._device.get_capture(self._cfg.timeout_ms)
+        capture = self._device.get_capture(self.timeout_ms)
         if capture is None:
             msg = "Timeout error while waiting for camera sample."
             raise OSError(msg)
@@ -44,8 +43,8 @@ def _poll_frame(self, camera_name: str = "") -> Frame:
             color=capture.color.data, ir=capture.ir.data, depth=capture.depth.data, temperature=capture.temperature
         )
         imu_frame = None
-        if self._cfg.include_imu:
-            imu_sample = self._device.get_imu_sample(self._cfg.timeout_ms)
+        if self.include_imu:
+            imu_sample = self._device.get_imu_sample(self.timeout_ms)
             if imu_sample is None:
                 msg = "Timeout error while waiting for IMU sample."
                 raise OSError(msg)
@@ -57,3 +56,13 @@ def _poll_frame(self, camera_name: str = "") -> Frame:
                 temperature=imu_sample.temperature,
             )
         return Frame(camera=camera_frame, imu=imu_frame, avg_timestamp=None)
+
+    def config(self, camera_name) -> BaseCameraConfig:
+        return self.cameras[camera_name]
+
+    def close(self):
+        if self._device is not None:
+            self._device.stop_cameras()
+            if self.include_imu:
+                self._device.stop_imu()
+            self._device = None
diff --git a/python/rcs/camera/realsense.py b/python/rcs/camera/realsense.py
index 1f0fac63..28b89a1b 100644
--- a/python/rcs/camera/realsense.py
+++ b/python/rcs/camera/realsense.py
@@ -1,23 +1,12 @@
+import logging
 from dataclasses import dataclass
 
 import numpy as np
 import pyrealsense2 as rs
-from rcs.camera.hw import BaseHardwareCameraSet, HWCameraSetConfig
+from rcs._core.common import BaseCameraConfig
+from rcs.camera.hw import HardwareCamera
 from rcs.camera.interface import CameraFrame, DataFrame, Frame, IMUFrame
 
-# class RealSenseConfig(BaseCameraConfig):
-# dict with readable name and serial number
-
-
-class RealSenseSetConfig(HWCameraSetConfig):
-    # dict with readable name and serial number
-    # devices_to_enable: dict[str, str] = {}
-    # cameras: dict[str, RealSenseConfig] = []
-    enable_ir_emitter: bool = False
-    enable_ir: bool = False
-    laser_power: int = 330
-    enable_imu: bool = False
-
 
 @dataclass
 class RealSenseDeviceInfo:
@@ -32,39 +21,58 @@ class RealSenseDevicePipeline:
     camera: RealSenseDeviceInfo
 
 
-# TODO(juelg): look at frame queue
-class RealSenseCameraSet(BaseHardwareCameraSet):
+class RealSenseCameraSet(HardwareCamera):
     TIMESTAMP_FACTOR = 1e-3
 
-    def __init__(self, cfg: RealSenseSetConfig) -> None:
-        self._cfg = cfg
-        super().__init__()
-        self.D400_config = rs.config()
+    def __init__(
+        self,
+        cameras: dict[str, BaseCameraConfig],
+        enable_ir_emitter: bool = False,
+        enable_ir: bool = False,
+        laser_power: int = 330,
+        enable_imu: bool = False,
+    ) -> None:
+        self.enable_ir_emitter = enable_ir_emitter
+        self.enable_ir = enable_ir
+        self.laser_power = laser_power
+        self.enable_imu = enable_imu
+        self.cameras = cameras
+        self._logger = logging.getLogger(__name__)
+        assert (
+            len({camera.resolution_width for camera in self.cameras.values()}) == 1
+            and len({camera.resolution_height for camera in self.cameras.values()}) == 1
+            and len({camera.frame_rate for camera in self.cameras.values()}) == 1
+        ), "All cameras must have the same resolution and frame rate."
+        sample_camera_config = next(iter(self.cameras.values()))
+        self.resolution_width = sample_camera_config.resolution_width
+        self.resolution_height = sample_camera_config.resolution_height
+        self.frame_rate = sample_camera_config.frame_rate
 
+        self.D400_config = rs.config()
         self.D400_config.enable_stream(
             rs.stream.depth,
-            self._cfg.resolution_width,
-            self._cfg.resolution_height,
+            self.resolution_width,
+            self.resolution_height,
             rs.format.z16,
-            self._cfg.frame_rate,
+            self.frame_rate,
         )
         self.D400_config.enable_stream(
             rs.stream.color,
-            self._cfg.resolution_width,
-            self._cfg.resolution_height,
+            self.resolution_width,
+            self.resolution_height,
             rs.format.bgr8,
-            self._cfg.frame_rate,
+            self.frame_rate,
         )
-        if self._cfg.enable_ir:
+        if self.enable_ir:
             self.D400_config.enable_stream(
                 rs.stream.infrared,
                 1,
-                self._cfg.resolution_width,
-                self._cfg.resolution_height,
+                self.resolution_width,
+                self.resolution_height,
                 rs.format.y8,
-                self._cfg.frame_rate,
+                self.frame_rate,
             )
-        if self._cfg.enable_imu:
+        if self.enable_imu:
             # TODO(juelg): does not work work at the moment: "Couldnt resolve requests"
             # https://www.intelrealsense.com/how-to-getting-imu-data-from-d435i-and-t265/
             # Accelerometer available FPS: {63, 250}Hz
@@ -79,15 +87,26 @@ def __init__(self, cfg: RealSenseSetConfig) -> None:
                 rs.format.motion_xyz32f,
                 200,
             )
-
-        self._context = rs.context()
         self._available_devices: dict[str, RealSenseDeviceInfo] = {}
-        self.update_available_devices()
         self._enabled_devices: dict[str, RealSenseDevicePipeline] = {}  # serial numbers of te enabled devices
-        self.enable_devices(self.name_to_identifier, self._cfg.enable_ir_emitter)
+        self._camera_names = list(self.cameras.keys())
+
+    @property
+    def camera_names(self) -> list[str]:
+        """Returns the names of the cameras in this set."""
+        return self._camera_names
+
+    def config(self, camera_name) -> BaseCameraConfig:
+        return self.cameras[camera_name]
+
+    def open(self):
+        self._available_devices = {}
+        self.update_available_devices()
+        self._enabled_devices = {}  # serial numbers of te enabled devices
+        self.enable_devices({key: value.identifier for key, value in self.cameras.items()}, self.enable_ir_emitter)
 
     def update_available_devices(self):
-        self._available_devices = self.enumerate_connected_devices(self._context)
+        self._available_devices = self.enumerate_connected_devices(rs.context())
 
     def enable_all_devices(self, enable_ir_emitter: bool = False):
         """
@@ -143,18 +162,10 @@ def enable_device(self, device_info: RealSenseDeviceInfo, enable_ir_emitter: boo
         sensor = pipeline_profile.get_device().first_depth_sensor()
         if sensor.supports(rs.option.emitter_enabled):
             sensor.set_option(rs.option.emitter_enabled, 1 if enable_ir_emitter else 0)
-            sensor.set_option(rs.option.laser_power, self._cfg.laser_power)
+            sensor.set_option(rs.option.laser_power, self.laser_power)
         self._enabled_devices[device_info.serial] = RealSenseDevicePipeline(pipeline, pipeline_profile, device_info)
         self._logger.debug("Enabled device %s (%s)", device_info.serial, device_info.product_line)
 
-    @property
-    def config(self) -> RealSenseSetConfig:
-        return self._cfg
-
-    @config.setter
-    def config(self, cfg: RealSenseSetConfig) -> None:
-        self._cfg = cfg
-
     @staticmethod
     def enumerate_connected_devices(context: rs.context) -> dict[str, RealSenseDeviceInfo]:
         """
@@ -182,7 +193,7 @@ def enumerate_connected_devices(context: rs.context) -> dict[str, RealSenseDevic
                 connect_device[serial] = device_info
         return connect_device
 
-    def _poll_frame(self, camera_name: str) -> Frame:
+    def poll_frame(self, camera_name: str) -> Frame:
         # TODO(juelg): polling should be performed in a recorder thread
         # (anyway needed to record trajectory data to disk)
         # and this method should just access the latest frame from the recorder
@@ -190,8 +201,8 @@ def _poll_frame(self, camera_name: str) -> Frame:
 
         # TODO(juelg): what is a "pose" (in frame) and how can we use it
         # TODO(juelg): decide whether to use the poll method and to wait if devices get ready
-        assert camera_name in self.name_to_identifier, f"Camera {camera_name} not found in the enabled devices"
-        serial = self.name_to_identifier[camera_name]
+        assert camera_name in self.camera_names, f"Camera {camera_name} not found in the enabled devices"
+        serial = self.cameras[camera_name].identifier
         device = self._enabled_devices[serial]
 
         streams = device.pipeline_profile.get_streams()
@@ -317,7 +328,6 @@ def disable_streams(self):
         self.D400_config.disable_all_streams()
 
     def close(self):
-        super().close()
         for device in self._enabled_devices.values():
             device.pipeline.stop()
         self.disable_streams()
@@ -334,21 +344,4 @@ def enable_emitter(self, enable_ir_emitter=True):
                 continue
             sensor.set_option(rs.option.emitter_enabled, 1 if enable_ir_emitter else 0)
             if enable_ir_emitter:
-                sensor.set_option(rs.option.laser_power, self._cfg.laser_power)
-
-    def load_settings_json(self, path_to_settings_file):
-        """
-        Load the settings stored in the JSON file
-
-        """
-
-        with open(path_to_settings_file, "r") as file:
-            json_text = file.read().strip()
-
-        for device in self._enabled_devices.values():
-            if device.camera.product_line != "D400":
-                continue
-            # Get the active profile and load the json file which contains settings readable by the realsense
-            dev = device.pipeline_profile.get_device()
-            advanced_mode = rs.rs400_advanced_mode(dev)
-            advanced_mode.load_json(json_text)
+                sensor.set_option(rs.option.laser_power, self.laser_power)
diff --git a/python/rcs/camera/sim.py b/python/rcs/camera/sim.py
index 2ec82c27..8ea38c67 100644
--- a/python/rcs/camera/sim.py
+++ b/python/rcs/camera/sim.py
@@ -3,30 +3,12 @@
 
 import numpy as np
 import rcs
-from pydantic import Field
-from rcs._core.sim import CameraType
+
+# from rcs._core.common import BaseCameraConfig
 from rcs._core.sim import FrameSet as _FrameSet
-from rcs._core.sim import SimCameraConfig as _SimCameraConfig
+from rcs._core.sim import SimCameraConfig
 from rcs._core.sim import SimCameraSet as _SimCameraSet
-from rcs._core.sim import SimCameraSetConfig as _SimCameraSetConfig
-from rcs.camera.interface import (
-    BaseCameraConfig,
-    BaseCameraSetConfig,
-    CameraFrame,
-    DataFrame,
-    Frame,
-    FrameSet,
-)
-
-
-class SimCameraConfig(BaseCameraConfig):
-    type: int  # CamBaseCameraConfigeraType
-
-
-class SimCameraSetConfig(BaseCameraSetConfig):
-    cameras: dict[str, SimCameraConfig] = Field(default={})
-    max_buffer_frames: int = 1000
-    physical_units: bool = False
+from rcs.camera.interface import CameraFrame, DataFrame, Frame, FrameSet
 
 
 class SimCameraSet(_SimCameraSet):
@@ -34,34 +16,18 @@ class SimCameraSet(_SimCameraSet):
     Implements BaseCameraSet
     """
 
-    def __init__(self, sim: rcs.sim.Sim, cfg: SimCameraSetConfig):
+    def __init__(
+        self,
+        sim: rcs.sim.Sim,
+        cameras: dict[str, SimCameraConfig],
+        physical_units: bool = False,
+        render_on_demand: bool = True,
+    ):
         self._logger = logging.getLogger(__name__)
-        self._cfg = cfg
-        cameras: dict[str, _SimCameraConfig] = {}
-
-        def get_type(t):
-            if t == CameraType.fixed:
-                return CameraType.fixed
-            if t == CameraType.tracking:
-                return CameraType.tracking
-            if t == CameraType.free:
-                return CameraType.free
-            return CameraType.default_free
+        self.cameras = cameras
+        self.physical_units = physical_units
 
-        for name, camera_cfg in cfg.cameras.items():
-            cpp_camera_cfg = _SimCameraConfig()
-            cpp_camera_cfg.type = get_type(camera_cfg.type)
-            cpp_camera_cfg.identifier = camera_cfg.identifier
-            cameras[name] = cpp_camera_cfg
-
-        cpp_set_cfg = _SimCameraSetConfig()
-        cpp_set_cfg.cameras = cameras
-        cpp_set_cfg.resolution_width = cfg.resolution_width
-        cpp_set_cfg.resolution_height = cfg.resolution_height
-        cpp_set_cfg.frame_rate = cfg.frame_rate
-        cpp_set_cfg.max_buffer_frames = cfg.max_buffer_frames
-
-        super().__init__(sim, cpp_set_cfg)
+        super().__init__(sim, cameras, render_on_demand=render_on_demand)
         self._sim: rcs.sim.Sim
 
     def get_latest_frames(self) -> FrameSet | None:
@@ -80,15 +46,19 @@ def _cpp_to_python_frames(self, cpp_frameset: _FrameSet | None) -> FrameSet | No
         d_frames_iter = cpp_frameset.depth_frames.items()
         for (color_name, color_frame), (depth_name, depth_frame) in zip(c_frames_iter, d_frames_iter, strict=True):
             assert color_name == depth_name
-            color_np_frame = np.copy(color_frame).reshape(self._cfg.resolution_height, self._cfg.resolution_width, 3)[
+            color_np_frame = np.copy(color_frame).reshape(
+                self.cameras[color_name].resolution_height, self.cameras[color_name].resolution_width, 3
+            )[
                 # convert from column-major (c++ eigen) to row-major (python numpy)
                 ::-1
             ]
-            depth_np_frame = np.copy(depth_frame).reshape(self._cfg.resolution_height, self._cfg.resolution_width, 1)[
+            depth_np_frame = np.copy(depth_frame).reshape(
+                self.cameras[depth_name].resolution_height, self.cameras[depth_name].resolution_width, 1
+            )[
                 # convert from column-major (c++ eigen) to row-major (python numpy)
                 ::-1
             ]
-            if self._cfg.physical_units:
+            if self.physical_units:
                 # Convert from [0 1] to depth in meters, see links below:
                 # http://stackoverflow.com/a/6657284/1461210
                 # https://www.khronos.org/opengl/wiki/Depth_Buffer_Precision
@@ -105,19 +75,19 @@ def _cpp_to_python_frames(self, cpp_frameset: _FrameSet | None) -> FrameSet | No
             frames[color_name] = frame
         return FrameSet(frames=frames, avg_timestamp=cpp_frameset.timestamp)
 
+    def config(self, camera_name: str) -> SimCameraConfig:
+        """Should return the configuration of the camera with the given name."""
+        return self.cameras[camera_name]
+
     def close(self):
         # TODO: this could deregister camera callbacks in simulation
         pass
 
-    @property
-    def config(self) -> SimCameraSetConfig:
-        return self._cfg
-
     @property
     def camera_names(self) -> list[str]:
         """Should return a list of the activated human readable names of the cameras."""
-        return list(self._cfg.cameras.keys())
+        return list(self.cameras.keys())
 
     @property
     def name_to_identifier(self) -> dict[str, str]:
-        return self._cfg.name_to_identifier
+        return {name: cfg.identifier for name, cfg in self.cameras.items()}
diff --git a/python/rcs/envs/base.py b/python/rcs/envs/base.py
index 7733ec41..18451da2 100644
--- a/python/rcs/envs/base.py
+++ b/python/rcs/envs/base.py
@@ -587,18 +587,19 @@ def __init__(self, env, camera_set: BaseCameraSet, include_depth: bool = False):
         self.observation_space: gym.spaces.Dict
         # rgb is always included
         params: dict = {
-            "frame": {
-                "height": camera_set.config.resolution_height,
-                "width": camera_set.config.resolution_width,
+            f"/{name}/{self.RGB_KEY}/frame": {
+                "height": camera_set.config(name).resolution_height,
+                "width": camera_set.config(name).resolution_width,
             }
+            for name in camera_set.camera_names
         }
         if self.include_depth:
             # depth is optional
             params.update(
                 {
                     f"/{name}/{self.DEPTH_KEY}/frame": {
-                        "height": camera_set.config.resolution_height,
-                        "width": camera_set.config.resolution_width,
+                        "height": camera_set.config(name).resolution_height,
+                        "width": camera_set.config(name).resolution_width,
                         "color_dim": 1,
                         "dtype": np.float32,
                         "low": 0.0,
@@ -663,54 +664,6 @@ def close(self):
         super().close()
 
 
-class DigitCameraSetWrapper(CameraSetWrapper):
-    """Wrapper for digit cameras."""
-
-    def __init__(self, env, camera_set: BaseCameraSet):
-        super().__init__(env, camera_set)
-        # self.unwrapped: FR3Env
-        self.camera_set = camera_set
-        self.observation_space: gym.spaces.Dict
-        # rgb is always included
-        params: dict = {
-            "digit_frames": {
-                "height": camera_set.config.resolution_height,
-                "width": camera_set.config.resolution_width,
-            }
-        }
-
-        self.observation_space.spaces.update(
-            get_space(
-                DigitCameraDictType,
-                child_dict_keys_to_unfold={
-                    "camera_names": camera_set.camera_names,
-                },
-                params=params,
-            ).spaces
-        )
-        self.camera_key = get_space_keys(DigitCameraDictType)[0]
-
-    def observation(self, observation: dict, info: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
-        observation = copy.deepcopy(observation)
-        info = copy.deepcopy(info)
-        frameset = self.camera_set.get_latest_frames()
-        if frameset is None:
-            observation[self.camera_key] = {}
-            info["digit_available"] = False
-            return observation, info
-
-        frame_dict: dict[str, np.ndarray] = {
-            camera_name: frame.camera.color.data for camera_name, frame in frameset.frames.items()
-        }
-        observation[self.camera_key] = frame_dict
-
-        info["digit_available"] = True
-        if frameset.avg_timestamp is not None:
-            info["digit_timestamp"] = frameset.avg_timestamp
-
-        return observation, info
-
-
 class GripperWrapper(ActObsInfoWrapper):
     # TODO: sticky gripper, like in aloha
 
diff --git a/python/rcs/envs/creators.py b/python/rcs/envs/creators.py
index 8aafc6eb..2c8c45c8 100644
--- a/python/rcs/envs/creators.py
+++ b/python/rcs/envs/creators.py
@@ -9,13 +9,11 @@
 from gymnasium.envs.registration import EnvCreator
 from rcs import sim
 from rcs._core.sim import CameraType
-from rcs.camera.digit_cam import DigitCam
-from rcs.camera.hw import BaseHardwareCameraSet
-from rcs.camera.sim import SimCameraConfig, SimCameraSet, SimCameraSetConfig
+from rcs.camera.hw import HardwareCameraSet
+from rcs.camera.sim import SimCameraConfig, SimCameraSet
 from rcs.envs.base import (
     CameraSetWrapper,
     ControlMode,
-    DigitCameraSetWrapper,
     GripperWrapper,
     HandWrapper,
     MultiRobotWrapper,
@@ -60,8 +58,7 @@ def __call__(  # type: ignore
         robot_cfg: rcs.hw.FR3Config,
         collision_guard: str | PathLike | None = None,
         gripper_cfg: rcs.hw.FHConfig | rcs.hand.tilburg_hand.THConfig | None = None,
-        camera_set: BaseHardwareCameraSet | None = None,
-        digit_set: DigitCam | None = None,
+        camera_set: HardwareCameraSet | None = None,
         max_relative_movement: float | tuple[float, float] | None = None,
         relative_to: RelativeTo = RelativeTo.LAST_STEP,
         urdf_path: str | PathLike | None = None,
@@ -109,12 +106,6 @@ def __call__(  # type: ignore
             logger.info("CameraSet started")
             env = CameraSetWrapper(env, camera_set)
 
-        if digit_set is not None:
-            digit_set.start()
-            digit_set.wait_for_frames()
-            logger.info("DigitCameraSet started")
-            env = DigitCameraSetWrapper(env, digit_set)
-
         if collision_guard is not None:
             assert urdf_path is not None
             env = CollisionGuard.env_from_xml_paths(
@@ -140,7 +131,7 @@ def __call__(  # type: ignore
         control_mode: ControlMode,
         robot_cfg: rcs.hw.FR3Config,
         gripper_cfg: rcs.hw.FHConfig | None = None,
-        camera_set: BaseHardwareCameraSet | None = None,
+        camera_set: HardwareCameraSet | None = None,
         max_relative_movement: float | tuple[float, float] | None = None,
         relative_to: RelativeTo = RelativeTo.LAST_STEP,
         urdf_path: str | PathLike | None = None,
@@ -183,7 +174,9 @@ def __call__(  # type: ignore
         camera_config: dict[str, str] | None = None,
         gripper: bool = True,
     ) -> gym.Env:
-        camera_set = default_realsense(camera_config)
+        real_sense = default_realsense(camera_config)
+        camera_set_impl_list = [] if real_sense is None else [real_sense]
+        camera_set = HardwareCameraSet(cameras=camera_set_impl_list)
         return RCSFR3EnvCreator()(
             ip=robot_ip,
             camera_set=camera_set,
@@ -203,7 +196,7 @@ def __call__(  # type: ignore
         robot_cfg: rcs.sim.SimRobotConfig,
         collision_guard: bool = False,
         gripper_cfg: rcs.sim.SimGripperConfig | None = None,
-        camera_set_cfg: SimCameraSetConfig | None = None,
+        cameras: dict[str, SimCameraConfig] | None = None,
         max_relative_movement: float | tuple[float, float] | None = None,
         relative_to: RelativeTo = RelativeTo.LAST_STEP,
         urdf_path: str | PathLike | None = None,
@@ -244,8 +237,8 @@ def __call__(  # type: ignore
         env: gym.Env = RobotEnv(robot, control_mode)
         env = RobotSimWrapper(env, simulation, sim_wrapper)
 
-        if camera_set_cfg is not None:
-            camera_set = SimCameraSet(simulation, camera_set_cfg)
+        if cameras is not None:
+            camera_set = SimCameraSet(simulation, cameras, physical_units=True, render_on_demand=True)
             env = CameraSetWrapper(env, camera_set, include_depth=True)
 
         if gripper_cfg is not None and isinstance(gripper_cfg, rcs.sim.SimGripperConfig):
@@ -278,7 +271,7 @@ def __call__(  # type: ignore
         render_mode: str = "human",
         control_mode: ControlMode = ControlMode.CARTESIAN_TRPY,
         delta_actions: bool = True,
-        camera_cfg: SimCameraSetConfig | None = None,
+        cameras: dict[str, SimCameraConfig] | None = None,
     ) -> gym.Env:
 
         env_rel = RCSSimEnvCreator()(
@@ -286,7 +279,7 @@ def __call__(  # type: ignore
             robot_cfg=default_fr3_sim_robot_cfg(mjcf),
             collision_guard=False,
             gripper_cfg=default_fr3_sim_gripper_cfg(),
-            camera_set_cfg=camera_cfg,
+            cameras=cameras,
             max_relative_movement=(0.2, np.deg2rad(45)) if delta_actions else None,
             relative_to=RelativeTo.LAST_STEP,
             mjcf=mjcf,
@@ -312,22 +305,51 @@ def __call__(  # type: ignore
             resolution = (256, 256)
 
         cameras = {
-            "wrist": SimCameraConfig(identifier="wrist_0", type=int(CameraType.fixed)),
-            "bird_eye": SimCameraConfig(identifier="bird_eye_cam", type=int(CameraType.fixed)),
-            "side": SimCameraConfig(identifier="side_view", type=int(CameraType.fixed)),
-            "right_side": SimCameraConfig(identifier="right_side", type=int(CameraType.fixed)),
-            "left_side": SimCameraConfig(identifier="left_side", type=int(CameraType.fixed)),
-            "front": SimCameraConfig(identifier="front", type=int(CameraType.fixed)),
+            "wrist": SimCameraConfig(
+                identifier="wrist_0",
+                type=CameraType.fixed,
+                resolution_height=resolution[1],
+                resolution_width=resolution[0],
+                frame_rate=frame_rate,
+            ),
+            "bird_eye": SimCameraConfig(
+                identifier="bird_eye_cam",
+                type=CameraType.fixed,
+                resolution_height=resolution[1],
+                resolution_width=resolution[0],
+                frame_rate=frame_rate,
+            ),
+            "side": SimCameraConfig(
+                identifier="side_view",
+                type=CameraType.fixed,
+                resolution_height=resolution[1],
+                resolution_width=resolution[0],
+                frame_rate=frame_rate,
+            ),
+            "right_side": SimCameraConfig(
+                identifier="right_side",
+                type=CameraType.fixed,
+                resolution_height=resolution[1],
+                resolution_width=resolution[0],
+                frame_rate=frame_rate,
+            ),
+            "left_side": SimCameraConfig(
+                identifier="left_side",
+                type=CameraType.fixed,
+                resolution_height=resolution[1],
+                resolution_width=resolution[0],
+                frame_rate=frame_rate,
+            ),
+            "front": SimCameraConfig(
+                identifier="front",
+                type=CameraType.fixed,
+                resolution_height=resolution[1],
+                resolution_width=resolution[0],
+                frame_rate=frame_rate,
+            ),
         }
 
-        camera_cfg = SimCameraSetConfig(
-            cameras=cameras,
-            resolution_width=resolution[0],
-            resolution_height=resolution[1],
-            frame_rate=frame_rate,
-            physical_units=True,
-        )
-        return SimTaskEnvCreator()("fr3_simple_pick_up", render_mode, control_mode, delta_actions, camera_cfg)
+        return SimTaskEnvCreator()("fr3_simple_pick_up", render_mode, control_mode, delta_actions, cameras)
 
 
 class FR3SimplePickUpSimDigitHandEnvCreator(EnvCreator):
@@ -342,18 +364,17 @@ def __call__(  # type: ignore
         if resolution is None:
             resolution = (256, 256)
 
-        cameras = {"wrist": SimCameraConfig(identifier="wrist_0", type=int(CameraType.fixed))}
+        cameras = {
+            "wrist": SimCameraConfig(
+                identifier="wrist_0",
+                type=CameraType.fixed,
+                resolution_height=resolution[1],
+                resolution_width=resolution[0],
+                frame_rate=frame_rate,
+            )
+        }
 
-        camera_cfg = SimCameraSetConfig(
-            cameras=cameras,
-            resolution_width=resolution[0],
-            resolution_height=resolution[1],
-            frame_rate=frame_rate,
-            physical_units=True,
-        )
-        return SimTaskEnvCreator()(
-            "fr3_simple_pick_up_digit_hand", render_mode, control_mode, delta_actions, camera_cfg
-        )
+        return SimTaskEnvCreator()("fr3_simple_pick_up_digit_hand", render_mode, control_mode, delta_actions, cameras)
 
 
 class FR3LabPickUpSimDigitHandEnvCreator(EnvCreator):
@@ -370,22 +391,27 @@ def __call__(  # type: ignore
             resolution = (256, 256)
 
         cameras = {
-            "wrist": SimCameraConfig(identifier="wrist_0", type=int(CameraType.fixed)),
-            "side": SimCameraConfig(identifier="wrist_1", type=int(CameraType.fixed)),
+            "wrist": SimCameraConfig(
+                identifier="wrist_0",
+                type=CameraType.fixed,
+                resolution_height=resolution[1],
+                resolution_width=resolution[0],
+                frame_rate=frame_rate,
+            ),
+            "side": SimCameraConfig(
+                identifier="wrist_1",
+                type=CameraType.fixed,
+                resolution_height=resolution[1],
+                resolution_width=resolution[0],
+                frame_rate=frame_rate,
+            ),
         }
 
-        camera_cfg = SimCameraSetConfig(
-            cameras=cameras,
-            resolution_width=resolution[0],
-            resolution_height=resolution[1],
-            frame_rate=frame_rate,
-            physical_units=True,
-        )
         env_rel = SimTaskEnvCreator()(
             "lab_simple_pick_up_digit_hand",
             render_mode,
             control_mode,
             delta_actions,
-            camera_cfg,
+            cameras,
         )
         return CamRobot(env_rel, cam_robot_joints, "lab_simple_pick_up_digit_hand")
diff --git a/python/rcs/envs/utils.py b/python/rcs/envs/utils.py
index 3e9bb9b0..940ceb56 100644
--- a/python/rcs/envs/utils.py
+++ b/python/rcs/envs/utils.py
@@ -7,12 +7,11 @@
 import rcs
 from digit_interface import Digit
 from rcs import sim
+from rcs._core.common import BaseCameraConfig
 from rcs._core.hw import FR3Config, IKSolver
-from rcs._core.sim import CameraType
-from rcs.camera.digit_cam import DigitCam, DigitConfig
-from rcs.camera.interface import BaseCameraConfig
-from rcs.camera.realsense import RealSenseCameraSet, RealSenseSetConfig
-from rcs.camera.sim import SimCameraConfig, SimCameraSetConfig
+from rcs._core.sim import CameraType, SimCameraConfig
+from rcs.camera.digit_cam import DigitCam
+from rcs.camera.realsense import RealSenseCameraSet
 from rcs.hand.tilburg_hand import THConfig
 
 logger = logging.getLogger(__name__)
@@ -56,17 +55,11 @@ def default_tilburg_hw_hand_cfg(file: str | PathLike | None = None) -> THConfig:
 def default_realsense(name2id: dict[str, str] | None) -> RealSenseCameraSet | None:
     if name2id is None:
         return None
-    cameras = {name: BaseCameraConfig(identifier=id) for name, id in name2id.items()}
-    cam_cfg = RealSenseSetConfig(
-        cameras=cameras,
-        resolution_width=1280,
-        resolution_height=720,
-        frame_rate=15,
-        enable_imu=False,  # does not work with imu, why?
-        enable_ir=True,
-        enable_ir_emitter=False,
-    )
-    return RealSenseCameraSet(cam_cfg)
+    cameras = {
+        name: BaseCameraConfig(identifier=id, resolution_width=1280, resolution_height=720, frame_rate=30)
+        for name, id in name2id.items()
+    }
+    return RealSenseCameraSet(cameras=cameras)
 
 
 def default_fr3_sim_gripper_cfg(idx: str = "0") -> sim.SimGripperConfig:
@@ -78,28 +71,30 @@ def default_fr3_sim_gripper_cfg(idx: str = "0") -> sim.SimGripperConfig:
 def default_digit(name2id: dict[str, str] | None, stream_name: str = "QVGA") -> DigitCam | None:
     if name2id is None:
         return None
-    cameras = {name: BaseCameraConfig(identifier=id) for name, id in name2id.items()}
     stream_dict = Digit.STREAMS[stream_name]
-    digit_cfg = DigitConfig(
-        cameras=cameras,
-        resolution_height=stream_dict["resolution"]["height"],
-        resolution_width=stream_dict["resolution"]["width"],
-        stream_name=stream_name,
-        frame_rate=stream_dict["fps"]["30fps"],
-    )
-    return DigitCam(digit_cfg)
-
-
-def default_mujoco_cameraset_cfg() -> SimCameraSetConfig:
     cameras = {
-        "wrist": SimCameraConfig(identifier="wrist_0", type=int(CameraType.fixed)),
-        "default_free": SimCameraConfig(identifier="", type=int(CameraType.default_free)),
-        # "bird_eye": SimCameraConfig(identifier="bird_eye_cam", type=int(CameraType.fixed)),
+        name: BaseCameraConfig(
+            identifier=id,
+            resolution_width=stream_dict["resolution"]["width"],
+            resolution_height=stream_dict["resolution"]["height"],
+            frame_rate=stream_dict["fps"]["30fps"],
+        )
+        for name, id in name2id.items()
     }
+    return DigitCam(cameras=cameras)
+
+
+def default_mujoco_cameraset_cfg() -> dict[str, SimCameraConfig]:
     # 256x256 needed for VLAs
-    return SimCameraSetConfig(
-        cameras=cameras, resolution_width=256, resolution_height=256, frame_rate=10, physical_units=True
-    )
+    return {
+        "wrist": SimCameraConfig(
+            identifier="wrist_0", type=CameraType.fixed, frame_rate=10, resolution_width=256, resolution_height=256
+        ),
+        "default_free": SimCameraConfig(
+            identifier="", type=CameraType.default_free, frame_rate=10, resolution_width=256, resolution_height=256
+        ),
+        # "bird_eye": SimCameraConfig(identifier="bird_eye_cam", type=int(CameraType.fixed), frame_rate=10, resolution_width=256, resolution_height=256),
+    }
 
 
 def get_tcp_offset(mjcf: str | Path) -> rcs.common.Pose:
diff --git a/python/rcs/hand/tilburg_hand.py b/python/rcs/hand/tilburg_hand.py
index 14cef140..7c0a9b6b 100644
--- a/python/rcs/hand/tilburg_hand.py
+++ b/python/rcs/hand/tilburg_hand.py
@@ -1,6 +1,6 @@
 import copy
-from dataclasses import dataclass
 import logging
+from dataclasses import dataclass
 from time import sleep
 
 import numpy as np
diff --git a/python/tests/test_sim_envs.py b/python/tests/test_sim_envs.py
index 3acbac81..ccc0bfa2 100644
--- a/python/tests/test_sim_envs.py
+++ b/python/tests/test_sim_envs.py
@@ -65,7 +65,7 @@ def test_reset(self, cfg, gripper_cfg, cam_cfg):
         # - test initial pose after reset.
         # - test initial gripper config.
         env = RCSSimEnvCreator()(
-            ControlMode.CARTESIAN_TRPY, cfg, gripper_cfg=gripper_cfg, camera_set_cfg=cam_cfg, max_relative_movement=None
+            ControlMode.CARTESIAN_TRPY, cfg, gripper_cfg=gripper_cfg, cameras=cam_cfg, max_relative_movement=None
         )
         # Test double reset. Regression test. A lot can go wrong when resetting.
         env.reset()
@@ -76,7 +76,7 @@ def test_zero_action_trpy(self, cfg):
         Test that a zero action does not change the state significantly
         """
         env = RCSSimEnvCreator()(
-            ControlMode.CARTESIAN_TRPY, cfg, gripper_cfg=None, camera_set_cfg=None, max_relative_movement=None
+            ControlMode.CARTESIAN_TRPY, cfg, gripper_cfg=None, cameras=None, max_relative_movement=None
         )
         obs_initial, _ = env.reset()
         zero_action = TRPYDictType(xyzrpy=obs_initial["xyzrpy"])
@@ -89,7 +89,7 @@ def test_non_zero_action_trpy(self, cfg):
         """
         # env creation
         env = RCSSimEnvCreator()(
-            ControlMode.CARTESIAN_TRPY, cfg, gripper_cfg=None, camera_set_cfg=None, max_relative_movement=None
+            ControlMode.CARTESIAN_TRPY, cfg, gripper_cfg=None, cameras=None, max_relative_movement=None
         )
         obs_initial, _ = env.reset()
         # action to be performed
@@ -111,7 +111,7 @@ def test_non_zero_action_trpy(self, cfg):
     def test_relative_zero_action_trpy(self, cfg, gripper_cfg):
         # env creation
         env = RCSSimEnvCreator()(
-            ControlMode.CARTESIAN_TRPY, cfg, gripper_cfg=gripper_cfg, camera_set_cfg=None, max_relative_movement=0.5
+            ControlMode.CARTESIAN_TRPY, cfg, gripper_cfg=gripper_cfg, cameras=None, max_relative_movement=0.5
         )
         obs_initial, _ = env.reset()
         # action to be performed
@@ -123,7 +123,7 @@ def test_relative_zero_action_trpy(self, cfg, gripper_cfg):
     def test_relative_non_zero_action(self, cfg, gripper_cfg):
         # env creation
         env = RCSSimEnvCreator()(
-            ControlMode.CARTESIAN_TRPY, cfg, gripper_cfg=gripper_cfg, camera_set_cfg=None, max_relative_movement=0.5
+            ControlMode.CARTESIAN_TRPY, cfg, gripper_cfg=gripper_cfg, cameras=None, max_relative_movement=0.5
         )
         obs_initial, _ = env.reset()
         # action to be performed
@@ -141,7 +141,7 @@ def test_collision_trpy(self, cfg, gripper_cfg):
         """
         # env creation
         env = RCSSimEnvCreator()(
-            ControlMode.CARTESIAN_TRPY, cfg, gripper_cfg=gripper_cfg, camera_set_cfg=None, max_relative_movement=None
+            ControlMode.CARTESIAN_TRPY, cfg, gripper_cfg=gripper_cfg, cameras=None, max_relative_movement=None
         )
         obs, _ = env.reset()
         # an obvious below ground collision action
@@ -162,7 +162,7 @@ def test_collision_guard_trpy(self, cfg, gripper_cfg):
             cfg,
             gripper_cfg=gripper_cfg,
             collision_guard=True,
-            camera_set_cfg=None,
+            cameras=None,
             max_relative_movement=None,
         )
         obs, _ = env.reset()
@@ -194,7 +194,7 @@ def test_reset(self, cfg, gripper_cfg, cam_cfg):
             ControlMode.CARTESIAN_TQuat,
             cfg,
             gripper_cfg=gripper_cfg,
-            camera_set_cfg=cam_cfg,
+            cameras=cam_cfg,
             max_relative_movement=None,
         )
         # Test double reset. Regression test. A lot can go wrong when resetting.
@@ -207,7 +207,7 @@ def test_non_zero_action_tquat(self, cfg):
         """
         # env creation
         env = RCSSimEnvCreator()(
-            ControlMode.CARTESIAN_TQuat, cfg, gripper_cfg=None, camera_set_cfg=None, max_relative_movement=None
+            ControlMode.CARTESIAN_TQuat, cfg, gripper_cfg=None, cameras=None, max_relative_movement=None
         )
         obs_initial, _ = env.reset()
         # action to be performed
@@ -228,7 +228,7 @@ def test_zero_action_tquat(self, cfg):
         """
         # env creation
         env = RCSSimEnvCreator()(
-            ControlMode.CARTESIAN_TQuat, cfg, gripper_cfg=None, camera_set_cfg=None, max_relative_movement=None
+            ControlMode.CARTESIAN_TQuat, cfg, gripper_cfg=None, cameras=None, max_relative_movement=None
         )
         obs_initial, info_ = env.reset()
         home_action_vec = obs_initial["tquat"]
@@ -242,7 +242,7 @@ def test_relative_zero_action_tquat(self, cfg, gripper_cfg):
             ControlMode.CARTESIAN_TQuat,
             cfg,
             gripper_cfg=gripper_cfg,
-            camera_set_cfg=None,
+            cameras=None,
             max_relative_movement=0.5,
         )
         obs_initial, _ = env_rel.reset()
@@ -260,7 +260,7 @@ def test_collision_tquat(self, cfg, gripper_cfg):
             ControlMode.CARTESIAN_TQuat,
             cfg,
             gripper_cfg=gripper_cfg,
-            camera_set_cfg=None,
+            cameras=None,
             max_relative_movement=None,
         )
         obs, _ = env.reset()
@@ -282,7 +282,7 @@ def test_collision_guard_tquat(self, cfg, gripper_cfg):
             cfg,
             gripper_cfg=gripper_cfg,
             collision_guard=True,
-            camera_set_cfg=None,
+            cameras=None,
             max_relative_movement=None,
         )
         obs, _ = env.reset()
@@ -311,7 +311,7 @@ def test_reset(self, cfg, gripper_cfg, cam_cfg):
         # - test initial pose after reset.
         # - test initial gripper config.
         env = RCSSimEnvCreator()(
-            ControlMode.JOINTS, cfg, gripper_cfg=gripper_cfg, camera_set_cfg=cam_cfg, max_relative_movement=None
+            ControlMode.JOINTS, cfg, gripper_cfg=gripper_cfg, cameras=cam_cfg, max_relative_movement=None
         )
         # Test double reset. Regression test. A lot can go wrong when resetting.
         env.reset()
@@ -322,9 +322,7 @@ def test_zero_action_joints(self, cfg):
         This is for testing that a certain action leads to the expected change in state
         """
         # env creation
-        env = RCSSimEnvCreator()(
-            ControlMode.JOINTS, cfg, gripper_cfg=None, camera_set_cfg=None, max_relative_movement=None
-        )
+        env = RCSSimEnvCreator()(ControlMode.JOINTS, cfg, gripper_cfg=None, cameras=None, max_relative_movement=None)
         obs_initial, _ = env.reset()
         # action to be performed
         zero_action = JointsDictType(joints=np.array(obs_initial["joints"]))
@@ -338,9 +336,7 @@ def test_non_zero_action_joints(self, cfg):
         This is for testing that a certain action leads to the expected change in state
         """
         # env creation
-        env = RCSSimEnvCreator()(
-            ControlMode.JOINTS, cfg, gripper_cfg=None, camera_set_cfg=None, max_relative_movement=None
-        )
+        env = RCSSimEnvCreator()(ControlMode.JOINTS, cfg, gripper_cfg=None, cameras=None, max_relative_movement=None)
         obs_initial, _ = env.reset()
         new_joint_vals = obs_initial["joints"] + np.array([0.1, 0.1, 0.1, 0.1, -0.1, -0.1, 0.1], dtype=np.float32)
         # action to be performed
@@ -356,7 +352,7 @@ def test_collision_joints(self, cfg, gripper_cfg):
         """
         # env creation
         env = RCSSimEnvCreator()(
-            ControlMode.JOINTS, cfg, gripper_cfg=gripper_cfg, camera_set_cfg=None, max_relative_movement=None
+            ControlMode.JOINTS, cfg, gripper_cfg=gripper_cfg, cameras=None, max_relative_movement=None
         )
         env.reset()
         # the below action is a test_case where there is an obvious collision regardless of the gripper action
@@ -375,7 +371,7 @@ def test_collision_guard_joints(self, cfg, gripper_cfg):
             cfg,
             gripper_cfg=gripper_cfg,
             collision_guard=True,
-            camera_set_cfg=None,
+            cameras=None,
             max_relative_movement=None,
         )
         env.reset()
@@ -397,7 +393,7 @@ def test_relative_zero_action_joints(self, cfg, gripper_cfg):
         """
         # env creation
         env = RCSSimEnvCreator()(
-            ControlMode.JOINTS, cfg, gripper_cfg=gripper_cfg, camera_set_cfg=None, max_relative_movement=0.5
+            ControlMode.JOINTS, cfg, gripper_cfg=gripper_cfg, cameras=None, max_relative_movement=0.5
         )
         obs_initial, _ = env.reset()
         act = JointsDictType(joints=np.array([0, 0, 0, 0, 0, 0, 0], dtype=np.float32))
diff --git a/src/common/Camera.h b/src/common/Camera.h
index 3077da19..8ecc98b9 100644
--- a/src/common/Camera.h
+++ b/src/common/Camera.h
@@ -11,11 +11,12 @@ struct BaseCameraConfig {
   int resolution_width;
   int resolution_height;
   BaseCameraConfig(const std::string &identifier, int frame_rate,
-                   int resolution_width, int resolution_height = 480)
+                   int resolution_width, int resolution_height)
       : identifier(identifier),
         frame_rate(frame_rate),
         resolution_width(resolution_width),
         resolution_height(resolution_height) {}
+  ~BaseCameraConfig() {}
 };
 
 }  // namespace common
diff --git a/src/pybind/rcs.cpp b/src/pybind/rcs.cpp
index 87fc9966..7ddb4e30 100644
--- a/src/pybind/rcs.cpp
+++ b/src/pybind/rcs.cpp
@@ -518,6 +518,10 @@ PYBIND11_MODULE(_core, m) {
       .export_values();
   py::class_<rcs::sim::SimCameraConfig, rcs::common::BaseCameraConfig>(
       sim, "SimCameraConfig")
+      .def(py::init<const std::string &, int, int, int, rcs::sim::CameraType>(),
+           py::arg("identifier"), py::arg("frame_rate"),
+           py::arg("resolution_width"), py::arg("resolution_height"),
+           py::arg("type") = rcs::sim::CameraType::fixed)
       .def_readwrite("type", &rcs::sim::SimCameraConfig::type);
   py::class_<rcs::sim::FrameSet>(sim, "FrameSet")
       .def(py::init<>())
diff --git a/src/sim/camera.h b/src/sim/camera.h
index 1fb7eef2..0863e599 100644
--- a/src/sim/camera.h
+++ b/src/sim/camera.h
@@ -25,6 +25,12 @@ enum CameraType {
 
 struct SimCameraConfig : common::BaseCameraConfig {
   CameraType type;
+  SimCameraConfig(const std::string& identifier, int frame_rate,
+                  int resolution_width, int resolution_height,
+                  CameraType type = fixed)
+      : common::BaseCameraConfig(identifier, frame_rate, resolution_width,
+                                 resolution_height),
+        type(type) {}
 };
 
 // (H,W,3)