diff --git a/README.md b/README.md index dd1fa7c..3e58f48 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,9 @@ An environment is defined by: - ActionProcessor: an object of `type BaseAction` that defines functions that process the action. Generally simulator works by applying continous forces/torque, however the desired action space might be discrete or continuous. `BaseAction` objects allow the user to easily switch between different action spaces. - + ### Interactive Mode + To run environments in interactive mode, a function mapping the keyboard inputs into commands supplied to the agent must be defined. This function is typically called, `str2action` in `rlmaster` repository. The environment can be run in interactive mode in the following way: ``` @@ -31,7 +32,9 @@ env.interactive(move_single_env.str2action) ``` You can use the commands, `w, s, d, a` to move the agent and `q` to quit the interactive mode. - +### Stacker in Interactive Mode + +No interface has been developed yet, but after creating an instance of `SimpleStackerSimulator` from `stacker_agent`, calling `_setup_renderer` and `render` will open a display window. You can step by entering an xy position that will move the second block there instantly. Calling `render` again will show the updated positions of the blocks. They are initialized at position (0,0) if the initializer has not been called. 
##Environment in openAI gym format @@ -41,8 +44,7 @@ from core import gym_wrapper env = move_agent.get_environment() gymEnv = gym_wwapper.GymWrapper(env) ``` - - + diff --git a/core/gym_wrapper.py b/core/gym_wrapper.py index 11d38b6..acb490d 100644 --- a/core/gym_wrapper.py +++ b/core/gym_wrapper.py @@ -1,6 +1,10 @@ import numpy as np -from core import base_environment +import rlmaster.core.base_environment as base_environment from gym import spaces +from rllab.spaces.box import Box +from rllab.core.serializable import Serializable +from rllab.misc.overrides import overrides +from rllab.misc import logger class GymWrapper(object): def __init__(self, env): @@ -11,16 +15,15 @@ def __init__(self, env): else: assert isinstance(self.env.action_processor, base_environment.BaseContinuousAction) - self.action_space = spaces.Box( + self.action_space = Box( low = env.action_processor.minval(), high = env.action_processor.maxval(), - shape=(env.action_processor.action_dim(), - env.action_processor.num_actions())) + shape=(env.action_processor.action_dim())) obsNdim = self.env.observation_ndim() - obsKeys = obsNdim.keys() + obsKeys = list(obsNdim.keys()) assert len(obsKeys) == 1, 'gym only supports one observation type' self._obsKey = obsKeys[0] - self.observation_space = spaces.Box(low=0, high=255, shape=obsNdim[obsKeys[0]]) + self.observation_space = Box(low=0, high=255, shape=obsNdim[obsKeys[0]]) @property def frameskip(self): @@ -59,7 +62,7 @@ def _step(self, action): self.env.step(action) obs = self._observation() reward = self.env.reward() - done = False + done = self.env.simulator.done # test? 
return obs, reward, done, dict(reward=reward) @@ -68,7 +71,7 @@ def step(self, action): def viewer_setup(self): - self.env._renderer_setup() + self.env.setup_renderer() def render(self): @@ -77,4 +80,3 @@ def render(self): def _render(self): return self.env.render() - diff --git a/envs/cheetah_agent.py b/envs/cheetah_agent.py new file mode 100644 index 0000000..cc2351b --- /dev/null +++ b/envs/cheetah_agent.py @@ -0,0 +1,135 @@ +from rlmaster.core.base_environment import * +import numpy as np +from overrides import overrides +from rllab.mujoco_py import MjModel, MjViewer + +class HalfCheetahSimulator(BaseSimulator): + def __init__(self, **kwargs): + super(HalfCheetahSimulator, self).__init__(**kwargs) + + self._imSz = 512 + self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8) + + self.model = MjModel('../rlmaster/envs/mujoco_envs/xmls/half_cheetah.xml') + self.viewer = None + self._pos = {} + self._pos['torso'] = np.zeros((3,)) + self._range_min = -1 + self._range_max = 1 + + self.body_comvel = 0 + self.action = np.zeros((1, 2)) + self.init_qpos = self.model.data.qpos + self.init_qvel = self.model.data.qvel + self.init_qacc = self.model.data.qacc + self.init_ctrl = self.model.data.ctrl + self.frame_skip = 1 + + @overrides + def step(self, ctrl, loop=False): + ctrl = np.clip(ctrl, *(np.array([-1, -1, -1, -1, -1, -1]), np.array([1, 1, 1, 1, 1, 1]))) + self.model.data.ctrl = ctrl # + np.random.normal(size=ctrl.shape) + # print('gym', self.model.data.ctrl) + for i in range(self.frame_skip): + self.model.step() + self.model.forward() + ind = self.model.body_names.index('torso') + self._pos['torso'] = self.model.body_pos[ind] + self.body_comvel = self.model.body_comvels[ind] + self.action = ctrl + + @overrides + def get_image(self): + data, width, height = self.viewer.get_image() + self._im = np.fromstring(data, dtype='uint8').reshape(height, width, 3)[::-1,:,:] + # print(self._im) + return self._im.copy() + + @overrides + def _setup_renderer(self): + 
self.viewer = MjViewer(visible=True, init_width=self._imSz, init_height=self._imSz) + self.viewer.start() + self.viewer.set_model(self.model) + + @overrides + def render(self): + self.viewer.loop_once() + +class CheetahIm(BaseObservation): + + def get_body_com(self, body_name): + idx = self.simulator.model.body_names.index(body_name) + return self.simulator.model.data.com_subtree[idx] + + @overrides + def ndim(self): + dim = {} + dim['im'] = (1, 20) + return dim + + @overrides + def observation(self): + obs = {} + current_obs = np.concatenate([ + self.simulator.model.data.qpos.flatten()[1:], + self.simulator.model.data.qvel.flat, + self.get_body_com('torso').flat, + ]) + obs['im'] = current_obs + return obs + +class RewardCheetah(BaseRewarder): + + @property + def action(self): + return self.prms['sim'].action if hasattr(self.prms['sim'], 'action') else np.zeros((1,2)) + + @property + def body_comvel(self): + return self.prms['sim'].body_comvel if hasattr(self.prms['sim'], 'body_comvel') else 0 + + @overrides + def get(self): + ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(self.action)) + run_cost = -1 * self.body_comvel[0] + return -(ctrl_cost + run_cost) + +class ContinuousCheetahAction(BaseContinuousAction): + @overrides + def action_dim(self): + return 6 + + def minval(self): + return -1.25 + + def maxval(self): + return 1.1 + + @overrides + def process(self, action): + return action + +# TODO(jasmine): fix this so it initializes (is it called at the start of each epoch? 
find out) +class InitCheetah(BaseInitializer): + + def reset_mujoco(self): + self.simulator.model.data.qpos = self.simulator.init_qpos + 0.01 * np.random.normal(size=self.simulator.init_qpos.shape) + self.simulator.model.data.qvel = self.simulator.init_qvel + 0.1 * np.random.normal(size=self.simulator.init_qvel.shape) + self.simulator.model.data.qacc = self.simulator.init_qacc + self.simulator.model.data.ctrl = self.simulator.init_ctrl + + @overrides + def sample_env_init(self): + self.reset_mujoco() + self.simulator.model.forward() + +def get_environment(max_episode_length=100, initPrms={}, obsPrms={}, actPrms={}): + sim = HalfCheetahSimulator() + initObj = InitCheetah(sim, initPrms) + obsObj = CheetahIm(sim, obsPrms) + rewPrms = { 'sim': sim } + rewObj = RewardCheetah(sim, rewPrms) + actObj = ContinuousCheetahAction(actPrms) + env = BaseEnvironment(sim, initObj, obsObj, rewObj, actObj, + params={'max_episode_length':max_episode_length}) + return env diff --git a/envs/move_agent.py b/envs/move_agent.py index 28f1945..e8c78ff 100644 --- a/envs/move_agent.py +++ b/envs/move_agent.py @@ -1,6 +1,8 @@ -from core.base_environment import * +from rlmaster.core.base_environment import * import numpy as np from overrides import overrides +from rllab.mujoco_py import MjModel + from pyhelper_fns import vis_utils def str2action(cmd): @@ -54,7 +56,6 @@ def process(self, action): raise Exception('Action %s not recognized' % action) ctrl = np.array(ctrl).reshape((2,)) return ctrl - class ContinuousAction(BaseContinuousAction): @overrides @@ -65,6 +66,11 @@ def action_dim(self): def process(self, action): return action + def minval(self): + return -1 + + def maxval(self): + return 1 class MoveTeleportSimulator(BaseSimulator): def __init__(self, **kwargs): @@ -79,7 +85,7 @@ def __init__(self, **kwargs): #Manipulate radius self._manipulate_radius = 0.2 #Image size - self._imSz = 64 + self._imSz = 32 self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8) def 
object_names(self): @@ -148,7 +154,6 @@ def _setup_renderer(self): def render(self): self._canvas._display(self.get_image()) - class InitFixed(BaseInitializer): @overrides def sample_env_init(self): @@ -165,7 +170,6 @@ def sample_env_init(self): self.simulator._pos[k] = range_mag * self.random.rand(2,) + \ self.simulator._range_min - class ObsState(BaseObservation): @overrides def ndim(self): @@ -180,22 +184,21 @@ def observation(self): for i, k in enumerate(self.simulator._pos.keys()): obs[2*i, 2*i + 2] = self.simulator._pos[k].copy() return obs - class ObsIm(BaseObservation): @overrides def ndim(self): dim = {} - dim['im'] = (self.simulator._imSz, self.simulator._imSz, 3) + # dim['im'] = (self.simulator._imSz, self.simulator._imSz, 3) + dim['im'] = (3072, 1) return dim @overrides def observation(self): obs = {} - obs['im'] = self.simulator.get_image() + obs['im'] = self.simulator.get_image().flatten() return obs - class RewardSimple(BaseRewarder): #The radius around the goal in which reward is provided to the agent. 
@property @@ -208,15 +211,15 @@ def get(self): return 1 else: return 0 - def get_environment(initName='InitRandom', obsName='ObsIm', rewName='RewardSimple', - actType='DiscreteActionFour', max_episode_length=100, - initPrms={}, obsPrms={}, rewPrms={}, actPrms={}): - - sim = MoveTeleportSimulator() + actType='DiscreteActionFour', simName='MoveTeleportSimulator', + max_episode_length=100, initPrms={}, obsPrms={}, rewPrms={}, + actPrms={}): + sim = globals()[simName]() initObj = globals()[initName](sim, initPrms) obsObj = globals()[obsName](sim, obsPrms) + rewPrms = { 'sim': sim } rewObj = globals()[rewName](sim, rewPrms) actObj = globals()[actType](actPrms) env = BaseEnvironment(sim, initObj, obsObj, rewObj, actObj, diff --git a/envs/mujoco_envs/xmls/half_cheetah.xml b/envs/mujoco_envs/xmls/half_cheetah.xml new file mode 100644 index 0000000..5f7b25f --- /dev/null +++ b/envs/mujoco_envs/xmls/half_cheetah.xml @@ -0,0 +1,95 @@ + + + + + + + + + + diff --git a/envs/reacher_agent.py b/envs/reacher_agent.py new file mode 100644 index 0000000..26da313 --- /dev/null +++ b/envs/reacher_agent.py @@ -0,0 +1,214 @@ +from rlmaster.core.base_environment import * +import numpy as np +from overrides import overrides +from pyhelper_fns import vis_utils + +def contains(obj1, obj2): + if not obj1 or not obj2: + return False + x1, y1 = obj1.pos[0], obj1.pos[1] + x2, y2 = obj2.pos[0], obj2.pos[1] + s1, s2 = obj1.size, obj2.size + x1, y1 = x1, y1 + x2, y2 = x2, y2 + if x2 + 2 * s2 <= x1 or x1 + 2 * s1 <= x2: + return False + if y2 + 2 * s2 <= y1 or y1 + 2 * s1 <= y2: + return False + return True + +class StackedBox(): + def __init__(self, pos, size=2): + self.pos = pos.reshape((3,)) + self.size = size + +class ReacherArm(): + def __init__(self, pos, size=2): + self.pos = pos.reshape((2,)) + self.size = size + + self.block = None + +class SimpleReacherSimulator(BaseSimulator): + def __init__(self, num_blocks=2, **kwargs): + # should always have same size blocks (width = height = length = 
n)
        super(SimpleReacherSimulator, self).__init__(**kwargs)
        self._imSz = 32
        self.width = self._imSz
        self.height = self._imSz
        # One movable block per num_blocks, a goal box, and the arm.
        self.move_block = [StackedBox(np.zeros(3)) for i in range(num_blocks)]
        self.goal = StackedBox(np.zeros(3))
        self.reacher_arm = ReacherArm(np.zeros(2))

        self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
        # Keep positions far enough from the border that boxes stay on-canvas.
        self._range_min = self.reacher_arm.size
        self._range_max = self._imSz - self.reacher_arm.size

    @property
    def done(self):
        # Episode is done once every movable block sits on the goal
        # (stacking is signalled by pos[2] == goal.size).
        for block in self.move_block:
            if block.pos[2] != self.goal.size:
                return False
        return True

    @overrides
    def step(self, action):
        # Move the arm by (dx, dy); pick up a free block the arm covers and
        # drop the held block once it overlaps the goal.
        action = action.reshape((2,))

        # Evaluated BEFORE the move: drop/pick-up decisions use the pre-step
        # geometry.
        on_goal = contains(self.reacher_arm.block, self.goal)
        on_block = None
        for block in self.move_block:
            if contains(self.reacher_arm, block):
                on_block = block
                break

        # Grab a block under the arm, but never one already on the goal.
        if self.reacher_arm.block is None and on_block is not None and not contains(on_block, self.goal):
            self.reacher_arm.block = on_block

        # Release the held block onto the goal.
        if on_goal:
            self.reacher_arm.block.pos[2] = self.goal.size
            self.reacher_arm.block = None

        self.reacher_arm.pos = np.clip(self.reacher_arm.pos + action, self._range_min, self._range_max)

        # A held block moves with the arm.
        if self.reacher_arm.block is not None:
            block = self.reacher_arm.block
            action = np.append(action, 0)
            block.pos = np.clip(block.pos + action, self._range_min, self._range_max)
            # NOTE(review): z parked at _imSz while carried — presumably a
            # "held" marker (done/reward test z == goal.size); confirm.
            block.pos[2] = self._imSz

    def _plot_object(self, coords, box, color='r'):
        # Fill an axis-aligned rectangle of half-extents `box` centred at
        # `coords` into the chosen color channel, clipped to the canvas.
        x, y = coords
        mnx, mxx = int(max(x - box[0], 0)), int(min(self._imSz, x + box[0]))
        mny, mxy = int(max(y - box[1], 0)), int(min(self._imSz, y + box[1]))
        if color == 'r':
            self._im[mny:mxy, mnx:mxx, 0] = 255
        elif color == 'g':
            self._im[mny:mxy, mnx:mxx, 1] = 255
        else:
            self._im[mny:mxy, mnx:mxx, 2] = 255

    @overrides
    def get_image(self):
        # other blocks in red, active block in green
        imSz = self._imSz
        self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)
        x, y = self.goal.pos[0], self.goal.pos[1]
        self._plot_object((x, y), (2, 2), 'r')
        for block in self.move_block:
            x, y = block.pos[0], block.pos[1]
            self._plot_object((x, y), (2, 2), 'g')
        x, y = self.reacher_arm.pos[0], self.reacher_arm.pos[1]
        self._plot_object((x, y), (2, 2), 'b')
        return self._im.copy()

    @overrides
    def _setup_renderer(self):
        self._canvas = vis_utils.MyAnimation(None, height=self._imSz, width=self._imSz)

    @overrides
    def render(self):
        # Lazily create the canvas so render() works without explicit setup.
        if not hasattr(self, '_canvas'):
            self._setup_renderer()
        self._canvas._display(self.get_image())


class StackerIm(BaseObservation):
    # Raw (H, W, 3) uint8 image observation.

    @overrides
    def ndim(self):
        dim = {}
        dim['im'] = (self.simulator._imSz, self.simulator._imSz, 3)
        return dim

    @overrides
    def observation(self):
        obs = {}
        obs['im'] = self.simulator.get_image()
        return obs


class StateIm(BaseObservation):
    # Flat state vector: scaled goal pos, scaled arm pos, a held-block flag,
    # then each block's scaled position.

    @overrides
    def ndim(self):
        dim = {}
        num_blocks = len(self.simulator.move_block)
        dim['im'] = (num_blocks * 3 + 6, 1)
        return dim

    def scale(self, obs):
        # Map pixel coordinates [0, _imSz] into [-1, 1].
        obs = np.copy(obs)
        obs = obs / float(self.simulator._imSz) * 2 - 1
        return obs

    @overrides
    def observation(self):
        obs = {}

        m_block = self.simulator.move_block
        g_pos = self.simulator.goal.pos
        r_pos = self.simulator.reacher_arm.pos
        new_im = np.append(self.scale(g_pos), self.scale(r_pos))
        new_im = np.append(new_im, np.array([1 if self.simulator.reacher_arm.block is not None else 0]))
        for b in m_block:
            new_im = np.append(new_im, self.scale(b.pos))
        obs['im'] = new_im
        return obs

####@pulkitag: This is good.
+class RewardReacher(BaseRewarder): + + @property + def contains_block(self): + if hasattr(self.prms['sim'], 'move_block') and hasattr(self.prms['sim'], 'goal'): + move_block = self.prms['sim'].move_block + goal = self.prms['sim'].goal + reward = 1 if all([block.pos[2] == goal.size for block in move_block]) else 0 + return reward + return 0 + + @overrides + def get(self): + return self.contains_block + +# need additional actions: to pick up and to drop the block +class ContinuousReacherAction(BaseContinuousAction): + @overrides + def action_dim(self): + return 2 + + @overrides + def process(self, action): + return action + + def minval(self): + return -1 + + def maxval(self): + return 1 + +class InitReacher(BaseInitializer): + @overrides + def sample_env_init(self): + sim = self.simulator['sim'] + sim.reacher_arm.pos = np.random.randint(sim._range_min, sim._range_max, size=2) + sim.reacher_arm.block = None + + sim.goal.pos = np.random.randint(sim._range_min, sim._range_max, size=3) + sim.goal.pos[2] = 0 + for block in sim.move_block: + block.pos = np.random.randint(sim._range_min, sim._range_max, size=3) + block.pos[2] = 0 + if contains(sim.goal, block): + block.pos[2] = sim.goal.size + +def get_environment(obsType='StateIm', max_episode_length=100, initPrms={}, obsPrms={}, rewPrms={}, actPrms={}): + sim = SimpleReacherSimulator() + initPrms = {'sim' : sim} + initObj = InitReacher(initPrms) + obsObj = globals()[obsType](sim, obsPrms) + rewPrms = { 'sim': sim } + rewObj = RewardReacher(sim, rewPrms) + actObj = ContinuousReacherAction(actPrms) + env = BaseEnvironment(sim, initObj, obsObj, rewObj, actObj, + params={'max_episode_length':max_episode_length}) + return env diff --git a/envs/stacker_agent.py b/envs/stacker_agent.py new file mode 100644 index 0000000..d8b81ce --- /dev/null +++ b/envs/stacker_agent.py @@ -0,0 +1,168 @@ +from rlmaster.core.base_environment import * +import numpy as np +from overrides import overrides +from pyhelper_fns import vis_utils + +def 
contains(obj1, obj2): + x1, y1 = obj1.pos[0], obj1.pos[1] + x2, y2 = obj2.pos[0], obj2.pos[1] + s1, s2 = obj1.size, obj2.size + x1, y1 = x1 - s1, y1 - s1 + x2, y2 = x2 - s2, y2 - s2 + if x2 + s2 <= x1 or x1 + s1 <= x2: + return False + if y2 + s2 <= y1 or y1 + s1 <= y2: + return False + return True + +class StackedBox(): + def __init__(self, pos, size=2): + # position is center of the box + self.pos = pos.reshape((3,)) + self.size = size + +class SimpleStackerSimulator(BaseSimulator): + def __init__(self, **kwargs): + super(SimpleStackerSimulator, self).__init__(**kwargs) + self._imSz = 32 + self.width = self._imSz + self.height = self._imSz + self.move_block = StackedBox(np.zeros(3)) + self.other_block = StackedBox(np.zeros(3)) + + self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8) + self._range_min = self.move_block.size + self._range_max = self._imSz - self.move_block.size + + @overrides + def step(self, pos): + #####@pulkitag: You should make use of BaseDiscreteAction and BaseContinuousAction + #### to input the actions. 
+ pos = pos.reshape((2,)) + pos = np.append(pos, np.zeros(1)) + + new_pos = np.clip(self.move_block.pos + pos, self._range_min, self._range_max) + new_pos[2] = 0 + self.move_block.pos = new_pos + if contains(self.move_block, self.other_block): + self.move_block.pos[2] = self.other_block.size + + def _plot_object(self, coords, color='r'): + x, y = coords + mnx, mxx = max(x - self.move_block.size, 0), min(self._imSz, x + self.move_block.size) + mny, mxy = max(y - self.move_block.size, 0), min(self._imSz, y + self.move_block.size) + if color == 'r': + self._im[mny:mxy, mnx:mxx, 0] = 255 + elif color == 'g': + self._im[mny:mxy, mnx:mxx, 1] = 255 + else: + self._im[mny:mxy, mnx:mxx, 2] = 255 + + @overrides + def get_image(self): + imSz = self._imSz + rng = np.linspace(self._range_min, self._range_max, imSz) + x_1, y_1 = self.other_block.pos[0], self.other_block.pos[1] + x_2, y_2 = self.move_block.pos[0], self.move_block.pos[1] + self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8) + self._plot_object((x_1, y_1), 'r') + self._plot_object((x_2, y_2), 'g') + return self._im.copy() + + @overrides + def _setup_renderer(self): + self._canvas = vis_utils.MyAnimation(None, height=self._imSz, width=self._imSz) + + @overrides + def render(self): + self._canvas._display(self.get_image()) + +class StackerIm(BaseObservation): + + @overrides + def ndim(self): + dim = {} + dim['im'] = (self.simulator._imSz, self.simulator._imSz, 3) + return dim + + @overrides + def observation(self): + obs = {} + obs['im'] = self.simulator.get_image() + return obs + +class StateIm(BaseObservation): + + @overrides + def ndim(self): + dim = {} + dim['im'] = (6, 1) + return dim + + def scale(self, obs): + norm = np.linalg.norm(obs) + obs = np.copy(obs) + obs = obs / 16 - 1 + return obs + + @overrides + def observation(self): + obs = {} + m_pos = self.simulator.move_block.pos + o_pos = self.simulator.other_block.pos + new_im = np.append(self.scale(m_pos), self.scale(o_pos)) + obs['im'] = new_im + return 
obs + + +####@pulkitag: This is good. +class RewardStacker(BaseRewarder): + + @property + def block_height(self): + if hasattr(self.prms['sim'], 'other_block') and hasattr(self.prms['sim'], 'move_block'): + return 1 if self.prms['sim'].move_block.pos[2] == self.prms['sim'].other_block.size else 0 + return 0 + + @overrides + def get(self): + return self.block_height + +class ContinuousStackerAction(BaseContinuousAction): + @overrides + def action_dim(self): + return 2 + + @overrides + def process(self, action): + return action + + def minval(self): + return -1 + + def maxval(self): + return 1 + +class InitStacker(BaseInitializer): + @overrides + def sample_env_init(self): + sim = self.simulator['sim'] + size = sim.move_block.size + sim.move_block.pos = np.random.randint(sim._range_min, sim._range_max, size=3) + sim.move_block.pos[2] = 0 + sim.other_block.pos = np.random.randint(sim._range_min, sim._range_max, size=3) + sim.other_block.pos[2] = 0 + if contains(sim.move_block, sim.other_block): + sim.move_block.pos[2] = size + +def get_environment(obsType='StateIm', max_episode_length=100, initPrms={}, obsPrms={}, rewPrms={}, actPrms={}): + sim = SimpleStackerSimulator() + rewPrms = { 'sim': sim } + initPrms = { 'sim': sim } + initObj = InitStacker(initPrms) + obsObj = globals()[obsType](sim, obsPrms) + rewObj = RewardStacker(sim, rewPrms) + actObj = ContinuousStackerAction(actPrms) + env = BaseEnvironment(sim, initObj, obsObj, rewObj, actObj, + params={'max_episode_length':max_episode_length}) + return env