Skip to content
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ An environment is defined by:
- ActionProcessor: an object of type `BaseAction` that defines functions that process the action. Generally, the simulator works
by applying continuous forces/torques; however, the desired action space might be discrete or continuous.
`BaseAction` objects allow the user to easily switch between different action spaces.

### Interactive Mode

To run environments in interactive mode, a function mapping the keyboard inputs into commands supplied to the agent must be defined. This function is typically called `str2action` in the `rlmaster` repository. The environment can be run in interactive mode in the following way:

```
Expand All @@ -31,7 +32,9 @@ env.interactive(move_single_env.str2action)
```
You can use the commands `w`, `s`, `d`, `a` to move the agent and `q` to quit the interactive mode.


### Cheetah in Interactive Mode

No interface has been developed yet, but after creating an instance of `SimpleStackerAgent` from `stacker_agent`, calling `_setup_renderer` and `render` will open a display window. You can step by entering an xy position that will move the second block there instantly. Calling `render` again will show the updated positions of the blocks. They are initialized at position (0,0) if not called with the initializer.

## Environment in OpenAI gym format

Expand All @@ -41,8 +44,7 @@ from core import gym_wrapper
env = move_agent.get_environment()
gymEnv = gym_wrapper.GymWrapper(env)
```






20 changes: 11 additions & 9 deletions core/gym_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import numpy as np
from core import base_environment
import rlmaster.core.base_environment as base_environment
from gym import spaces
from rllab.spaces.box import Box
from rllab.core.serializable import Serializable
from rllab.misc.overrides import overrides
from rllab.misc import logger

class GymWrapper(object):
def __init__(self, env):
Expand All @@ -11,16 +15,15 @@ def __init__(self, env):
else:
assert isinstance(self.env.action_processor,
base_environment.BaseContinuousAction)
self.action_space = spaces.Box(
self.action_space = Box(
low = env.action_processor.minval(),
high = env.action_processor.maxval(),
shape=(env.action_processor.action_dim(),
env.action_processor.num_actions()))
shape=(env.action_processor.action_dim()))
obsNdim = self.env.observation_ndim()
obsKeys = obsNdim.keys()
obsKeys = list(obsNdim.keys())
assert len(obsKeys) == 1, 'gym only supports one observation type'
self._obsKey = obsKeys[0]
self.observation_space = spaces.Box(low=0, high=255, shape=obsNdim[obsKeys[0]])
self.observation_space = Box(low=0, high=255, shape=obsNdim[obsKeys[0]])

@property
def frameskip(self):
Expand Down Expand Up @@ -59,7 +62,7 @@ def _step(self, action):
self.env.step(action)
obs = self._observation()
reward = self.env.reward()
done = False
done = self.env.simulator.done # test?
return obs, reward, done, dict(reward=reward)


Expand All @@ -68,7 +71,7 @@ def step(self, action):


def viewer_setup(self):
self.env._renderer_setup()
self.env.setup_renderer()


def render(self):
Expand All @@ -77,4 +80,3 @@ def render(self):

def _render(self):
return self.env.render()

135 changes: 135 additions & 0 deletions envs/cheetah_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
from rlmaster.core.base_environment import *
import numpy as np
from overrides import overrides
from rllab.mujoco_py import MjModel, MjViewer

class HalfCheetahSimulator(BaseSimulator):
    """Mujoco-backed simulator for the half-cheetah model.

    Loads ``half_cheetah.xml`` and exposes the BaseSimulator interface:
    ``step`` applies actuator controls, ``get_image``/``render`` read from
    an MjViewer created by ``_setup_renderer``.
    """

    def __init__(self, **kwargs):
        super(HalfCheetahSimulator, self).__init__(**kwargs)

        # Square RGB render buffer; also used as the viewer window size.
        self._imSz = 512
        self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)

        self.model = MjModel('../rlmaster/envs/mujoco_envs/xmls/half_cheetah.xml')
        self.viewer = None
        self._pos = {}
        self._pos['torso'] = np.zeros((3,))
        self._range_min = -1
        self._range_max = 1

        self.body_comvel = 0
        self.action = np.zeros((1, 2))
        # Snapshot of the initial mujoco state; InitCheetah restores these
        # (with noise) on reset.
        self.init_qpos = self.model.data.qpos
        self.init_qvel = self.model.data.qvel
        self.init_qacc = self.model.data.qacc
        self.init_ctrl = self.model.data.ctrl
        self.frame_skip = 1

    @overrides
    def step(self, ctrl, loop=False):
        """Apply controls for ``frame_skip`` physics steps and cache torso state."""
        # Clip every actuator command to [-1, 1]; scalar bounds replace the
        # hard-coded six-element arrays and work for any control length.
        ctrl = np.clip(ctrl, -1.0, 1.0)
        self.model.data.ctrl = ctrl
        for _ in range(self.frame_skip):
            self.model.step()
        self.model.forward()
        ind = self.model.body_names.index('torso')
        self._pos['torso'] = self.model.body_pos[ind]
        self.body_comvel = self.model.body_comvels[ind]
        self.action = ctrl

    @overrides
    def get_image(self):
        """Return a copy of the current viewer frame as an HxWx3 uint8 array."""
        data, width, height = self.viewer.get_image()
        # np.frombuffer replaces the deprecated np.fromstring; rows are
        # flipped because the GL buffer is bottom-up.
        self._im = np.frombuffer(data, dtype='uint8').reshape(height, width, 3)[::-1, :, :]
        return self._im.copy()

    @overrides
    def _setup_renderer(self):
        # Must be called before get_image()/render(); creates the GL window.
        self.viewer = MjViewer(visible=True, init_width=self._imSz, init_height=self._imSz)
        self.viewer.start()
        self.viewer.set_model(self.model)

    @overrides
    def render(self):
        self.viewer.loop_once()

class CheetahIm(BaseObservation):
    """State observation for the half-cheetah, packed under the 'im' key."""

    def get_body_com(self, body_name):
        # Center of mass of the named body, read from the subtree COM table.
        body_index = self.simulator.model.body_names.index(body_name)
        return self.simulator.model.data.com_subtree[body_index]

    @overrides
    def ndim(self):
        # Single observation stream named 'im' holding 20 values.
        return {'im': (1, 20)}

    @overrides
    def observation(self):
        # Joint positions (dropping the root x), joint velocities, and the
        # torso center of mass, concatenated into one flat vector.
        model_data = self.simulator.model.data
        state = np.concatenate([
            model_data.qpos.flatten()[1:],
            model_data.qvel.flat,
            self.get_body_com('torso').flat,
        ])
        return {'im': state}

class RewardCheetah(BaseRewarder):
    """Reward for the half-cheetah: forward COM velocity minus a control cost."""

    @property
    def action(self):
        # Last control applied by the simulator; zero action before the
        # first step.
        sim = self.prms['sim']
        return sim.action if hasattr(sim, 'action') else np.zeros((1, 2))

    @property
    def body_comvel(self):
        # Torso center-of-mass velocity. The fallback must be indexable:
        # get() reads component [0], so the previous scalar fallback of 0
        # raised TypeError whenever the simulator lacked the attribute.
        sim = self.prms['sim']
        return sim.body_comvel if hasattr(sim, 'body_comvel') else np.zeros((3,))

    @overrides
    def get(self):
        """Return -(control cost + run cost): penalize effort, reward x-velocity."""
        ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(self.action))
        run_cost = -1 * self.body_comvel[0]
        return -(ctrl_cost + run_cost)

class ContinuousCheetahAction(BaseContinuousAction):
    """Continuous action space for the half-cheetah's six actuators."""

    # Actuator count and control bounds reported to wrappers.
    _NUM_ACTUATORS = 6
    _MINVAL = -1.25
    _MAXVAL = 1.1

    @overrides
    def action_dim(self):
        return self._NUM_ACTUATORS

    def minval(self):
        return self._MINVAL

    def maxval(self):
        return self._MAXVAL

    @overrides
    def process(self, action):
        # Controls pass straight through to the simulator unchanged.
        return action

# TODO(jasmine): fix this so it initializes (is it called at the start of each epoch? find out)
class InitCheetah(BaseInitializer):
    """Initializer that resets the half-cheetah to a noisy copy of its start state."""

    def reset_mujoco(self):
        # Restore the state snapshot taken in HalfCheetahSimulator.__init__,
        # adding Gaussian noise to positions (sd 0.01) and velocities (sd 0.1);
        # accelerations and controls are restored exactly.
        self.simulator.model.data.qpos = self.simulator.init_qpos + 0.01 * np.random.normal(size=self.simulator.init_qpos.shape)
        self.simulator.model.data.qvel = self.simulator.init_qvel + 0.1 * np.random.normal(size=self.simulator.init_qvel.shape)
        self.simulator.model.data.qacc = self.simulator.init_qacc
        self.simulator.model.data.ctrl = self.simulator.init_ctrl

    @overrides
    def sample_env_init(self):
        # Reset, then forward the model so derived quantities are consistent
        # with the freshly written qpos/qvel.
        self.reset_mujoco()
        self.simulator.model.forward()

def get_environment(max_episode_length=100, initPrms=None, obsPrms=None, actPrms=None):
    """Build a half-cheetah BaseEnvironment.

    Args:
        max_episode_length: episode length cap stored in the environment params.
        initPrms: optional parameter dict for the initializer (default: empty).
        obsPrms: optional parameter dict for the observer (default: empty).
        actPrms: optional parameter dict for the action processor (default: empty).

    Returns:
        A BaseEnvironment wiring together the simulator, initializer,
        observation, rewarder, and action processor.
    """
    # None sentinels avoid mutable default arguments: a shared {} default
    # would leak state across calls if any component mutated its params.
    initPrms = {} if initPrms is None else initPrms
    obsPrms = {} if obsPrms is None else obsPrms
    actPrms = {} if actPrms is None else actPrms
    sim = HalfCheetahSimulator()
    initObj = InitCheetah(sim, initPrms)
    obsObj = CheetahIm(sim, obsPrms)
    rewObj = RewardCheetah(sim, {'sim': sim})
    actObj = ContinuousCheetahAction(actPrms)
    env = BaseEnvironment(sim, initObj, obsObj, rewObj, actObj,
                          params={'max_episode_length': max_episode_length})
    return env
31 changes: 17 additions & 14 deletions envs/move_agent.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from core.base_environment import *
from rlmaster.core.base_environment import *
import numpy as np
from overrides import overrides
from rllab.mujoco_py import MjModel

from pyhelper_fns import vis_utils

def str2action(cmd):
Expand Down Expand Up @@ -54,7 +56,6 @@ def process(self, action):
raise Exception('Action %s not recognized' % action)
ctrl = np.array(ctrl).reshape((2,))
return ctrl


class ContinuousAction(BaseContinuousAction):
@overrides
Expand All @@ -65,6 +66,11 @@ def action_dim(self):
def process(self, action):
return action

def minval(self):
return -1

def maxval(self):
return 1

class MoveTeleportSimulator(BaseSimulator):
def __init__(self, **kwargs):
Expand All @@ -79,7 +85,7 @@ def __init__(self, **kwargs):
#Manipulate radius
self._manipulate_radius = 0.2
#Image size
self._imSz = 64
self._imSz = 32
self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)

def object_names(self):
Expand Down Expand Up @@ -148,7 +154,6 @@ def _setup_renderer(self):
def render(self):
self._canvas._display(self.get_image())


class InitFixed(BaseInitializer):
@overrides
def sample_env_init(self):
Expand All @@ -165,7 +170,6 @@ def sample_env_init(self):
self.simulator._pos[k] = range_mag * self.random.rand(2,) + \
self.simulator._range_min


class ObsState(BaseObservation):
@overrides
def ndim(self):
Expand All @@ -180,22 +184,21 @@ def observation(self):
for i, k in enumerate(self.simulator._pos.keys()):
obs[2*i, 2*i + 2] = self.simulator._pos[k].copy()
return obs


class ObsIm(BaseObservation):
@overrides
def ndim(self):
dim = {}
dim['im'] = (self.simulator._imSz, self.simulator._imSz, 3)
# dim['im'] = (self.simulator._imSz, self.simulator._imSz, 3)
dim['im'] = (3072, 1)
return dim

@overrides
def observation(self):
obs = {}
obs['im'] = self.simulator.get_image()
obs['im'] = self.simulator.get_image().flatten()
return obs


class RewardSimple(BaseRewarder):
#The radius around the goal in which reward is provided to the agent.
@property
Expand All @@ -208,15 +211,15 @@ def get(self):
return 1
else:
return 0


def get_environment(initName='InitRandom', obsName='ObsIm', rewName='RewardSimple',
actType='DiscreteActionFour', max_episode_length=100,
initPrms={}, obsPrms={}, rewPrms={}, actPrms={}):

sim = MoveTeleportSimulator()
actType='DiscreteActionFour', simName='MoveTeleportSimulator',
max_episode_length=100, initPrms={}, obsPrms={}, rewPrms={},
actPrms={}):
sim = globals()[simName]()
initObj = globals()[initName](sim, initPrms)
obsObj = globals()[obsName](sim, obsPrms)
rewPrms = { 'sim': sim }
rewObj = globals()[rewName](sim, rewPrms)
actObj = globals()[actType](actPrms)
env = BaseEnvironment(sim, initObj, obsObj, rewObj, actObj,
Expand Down
Loading