From c2ef0b65f9013f69b8deb93759f69e9c27568ef9 Mon Sep 17 00:00:00 2001
From: JasmineDeng <jasmine.deng@gmail.com>
Date: Mon, 20 Mar 2017 18:19:29 -0700
Subject: [PATCH 01/10] implemented cheetah

---
 core/gym_wrapper.py                    | 19 +++---
 envs/move_agent.py                     | 77 +++++++++++++++++++--
 envs/mujoco_envs/xmls/half_cheetah.xml | 95 ++++++++++++++++++++++++++
 3 files changed, 176 insertions(+), 15 deletions(-)
 create mode 100644 envs/mujoco_envs/xmls/half_cheetah.xml

diff --git a/core/gym_wrapper.py b/core/gym_wrapper.py
index 11d38b6..dd9d9d6 100644
--- a/core/gym_wrapper.py
+++ b/core/gym_wrapper.py
@@ -1,6 +1,10 @@
 import numpy as np
-from core import base_environment
+import rlmaster.core.base_environment as base_environment
 from gym import spaces
+from rllab.spaces.box import Box
+from rllab.core.serializable import Serializable
+from rllab.misc.overrides import overrides
+from rllab.misc import logger
 
 class GymWrapper(object):
   def __init__(self, env):
@@ -11,16 +15,15 @@ def __init__(self, env):
     else:
       assert isinstance(self.env.action_processor,
                         base_environment.BaseContinuousAction)
-      self.action_space = spaces.Box(
+      self.action_space = Box(
                       low  = env.action_processor.minval(), 
                       high = env.action_processor.maxval(), 
-                      shape=(env.action_processor.action_dim(), 
-                             env.action_processor.num_actions()))
+                      shape=(env.action_processor.action_dim()))
     obsNdim = self.env.observation_ndim()   
-    obsKeys = obsNdim.keys()
+    obsKeys = list(obsNdim.keys())
     assert len(obsKeys) == 1, 'gym only supports one observation type'
     self._obsKey = obsKeys[0]
-    self.observation_space = spaces.Box(low=0, high=255, shape=obsNdim[obsKeys[0]])
+    self.observation_space = Box(low=0, high=255, shape=obsNdim[obsKeys[0]])
 
   @property
   def frameskip(self):
@@ -68,7 +71,7 @@ def step(self, action):
 
 
   def viewer_setup(self):
-    self.env._renderer_setup() 
+    self.env.setup_renderer() 
 
 
   def render(self):
@@ -76,5 +79,5 @@ def render(self):
 
 
   def _render(self):
+    print('i am rendering')
     return self.env.render() 
-  
diff --git a/envs/move_agent.py b/envs/move_agent.py
index 28f1945..01811cb 100644
--- a/envs/move_agent.py
+++ b/envs/move_agent.py
@@ -1,6 +1,8 @@
-from core.base_environment import *
+from rlmaster.core.base_environment import *
 import numpy as np
 from overrides import overrides
+from rllab.mujoco_py import MjModel
+
 from pyhelper_fns import vis_utils
 
 def str2action(cmd):
@@ -54,7 +56,6 @@ def process(self, action):
       raise Exception('Action %s not recognized' % action)
     ctrl = np.array(ctrl).reshape((2,))
     return ctrl 
- 
 
 class ContinuousAction(BaseContinuousAction):
   @overrides
@@ -65,6 +66,11 @@ def action_dim(self):
   def process(self, action):
     return action  
 
+  def minval(self):
+    return -1
+
+  def maxval(self):
+    return 1
 
 class MoveTeleportSimulator(BaseSimulator):
   def __init__(self, **kwargs):
@@ -79,7 +85,7 @@ def __init__(self, **kwargs):
     #Manipulate radius
     self._manipulate_radius = 0.2
     #Image size
-    self._imSz = 64 
+    self._imSz = 32 
     self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)      
 
   def object_names(self):
@@ -148,6 +154,39 @@ def _setup_renderer(self):
   def render(self):
     self._canvas._display(self.get_image())
 
+class HalfCheetahSimulator(MoveTeleportSimulator):
+  def __init__(self, **kwargs):
+    super(HalfCheetahSimulator, self).__init__(**kwargs)
+    self.model = MjModel('rlmaster/envs/mujoco_envs/xmls/half_cheetah.xml')
+    self._pos = {}
+    self._pos['torso'] = np.zeros((3,))
+    self._range_min = -1
+    self._range_max = 1
+
+    self._imSz = 32
+    self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
+    self.body_comvel = 0
+    self.action = np.zeros((1, 2))
+
+  @overrides
+  def step(self, ctrl):
+    self.model.data.ctrl = ctrl
+    for i in range(10):
+      self.model.step()
+    self.model.forward()
+    ind = self.model.body_names.index('torso')
+    self._pos['torso'] = self.model.body_pos[ind]
+    self.body_comvel = self.model.body_comvels[ind]
+    self.action = ctrl
+
+  @overrides 
+  def get_image(self):
+    imSz = self._imSz
+    rng = np.linspace(self._range_min, self._range_max, imSz)
+    g_x, g_y = self._get_bin(rng, self._pos['torso'])
+    self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)   
+    self._plot_object((g_x, g_y), 'r')
+    return self._im.copy()
   
 class InitFixed(BaseInitializer):
   @overrides
@@ -195,6 +234,13 @@ def observation(self):
     obs['im'] =  self.simulator.get_image()
     return obs
 
+class CheetahIm(ObsIm):
+  @overrides
+  def observation(self):
+    obs = {}
+    obs['im'] = self.simulator.get_image().flatten()
+    return obs
+
 
 class RewardSimple(BaseRewarder):
   #The radius around the goal in which reward is provided to the agent.
@@ -208,15 +254,32 @@ def get(self):
       return 1
     else:
       return 0 
+
+class RewardCheetah(BaseRewarder):
+
+  @property
+  def action(self):
+    return self.prms['sim'].action if hasattr(self.prms['sim'], 'action') else np.zeros((1,2))
+
+  @property
+  def body_comvel(self):
+    return self.prms['sim'].body_comvel if hasattr(self.prms['sim'], 'body_comvel') else 0
+
+  @overrides
+  def get(self):
+    ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(self.action))
+    run_cost = -1 * self.body_comvel[0]
+    return ctrl_cost + run_cost
  
 
 def get_environment(initName='InitRandom', obsName='ObsIm', rewName='RewardSimple',
-                    actType='DiscreteActionFour', max_episode_length=100,
-                    initPrms={}, obsPrms={}, rewPrms={}, actPrms={}):
-
-  sim     = MoveTeleportSimulator()
+                    actType='DiscreteActionFour', simName='MoveTeleportSimulator',
+                    max_episode_length=100, initPrms={}, obsPrms={}, rewPrms={}, 
+                    actPrms={}):
+  sim     = globals()[simName]()
   initObj = globals()[initName](sim, initPrms)
   obsObj  = globals()[obsName](sim, obsPrms)
+  rewPrms = { 'sim': sim }
   rewObj  = globals()[rewName](sim, rewPrms)
   actObj  = globals()[actType](actPrms)
   env     = BaseEnvironment(sim, initObj, obsObj, rewObj, actObj, 
diff --git a/envs/mujoco_envs/xmls/half_cheetah.xml b/envs/mujoco_envs/xmls/half_cheetah.xml
new file mode 100644
index 0000000..5f7b25f
--- /dev/null
+++ b/envs/mujoco_envs/xmls/half_cheetah.xml
@@ -0,0 +1,95 @@
+<!-- Cheetah Model
+
+    The state space is populated with joints in the order that they are
+    defined in this file. The actuators also operate on joints.
+
+    State-Space (name/joint/parameter):
+        - rootx     slider      position (m)
+        - rootz     slider      position (m)
+        - rooty     hinge       angle (rad)
+        - bthigh    hinge       angle (rad)
+        - bshin     hinge       angle (rad)
+        - bfoot     hinge       angle (rad)
+        - fthigh    hinge       angle (rad)
+        - fshin     hinge       angle (rad)
+        - ffoot     hinge       angle (rad)
+        - rootx     slider      velocity (m/s)
+        - rootz     slider      velocity (m/s)
+        - rooty     hinge       angular velocity (rad/s)
+        - bthigh    hinge       angular velocity (rad/s)
+        - bshin     hinge       angular velocity (rad/s)
+        - bfoot     hinge       angular velocity (rad/s)
+        - fthigh    hinge       angular velocity (rad/s)
+        - fshin     hinge       angular velocity (rad/s)
+        - ffoot     hinge       angular velocity (rad/s)
+
+    Actuators (name/actuator/parameter):
+        - bthigh    hinge       torque (N m)
+        - bshin     hinge       torque (N m)
+        - bfoot     hinge       torque (N m)
+        - fthigh    hinge       torque (N m)
+        - fshin     hinge       torque (N m)
+        - ffoot     hinge       torque (N m)
+
+-->
+<mujoco model='cheetah'>
+  <compiler inertiafromgeom='true' coordinate='local' angle='radian' settotalmass='14' />
+  <default>
+    <joint limited='true' damping='.01' armature='.1' stiffness='8' solreflimit='.02 1' solimplimit='0 .8 .03' />
+    <geom contype='1' conaffinity='0' condim='3' friction='.4 .1 .1' rgba='0.8 0.6 .4 1' solimp='0.0 0.8 0.01' solref='0.02 1' />
+    <motor ctrlrange='-1 1' ctrllimited='true' />
+  </default>
+  <size nstack='300000' nuser_geom='1' />
+  <option timestep='0.01' gravity='0 0 -9.81' />
+  <asset>
+    <texture type="skybox" builtin="gradient" width="100" height="100" rgb1="1 1 1" rgb2="0 0 0" />
+    <texture name="texgeom" type="cube" builtin="flat" mark="cross" width="127" height="1278" rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" markrgb="1 1 1" random="0.01" />
+    <texture name="texplane" type="2d" builtin="checker" rgb1="0 0 0" rgb2="0.8 0.8 0.8" width="100" height="100" />
+    <material name='MatPlane' texture="texplane" shininess="1" texrepeat="60 60" specular="1"  reflectance="0.5" />
+    <material name='geom' texture="texgeom" texuniform="true" />
+  </asset>
+  <worldbody>
+    <light directional="true" cutoff="100" exponent="1" diffuse="1 1 1" specular=".1 .1 .1" pos="0 0 1.3" dir="-0 0 -1.3" />
+    <geom name='floor' material="MatPlane" pos='0 0 0' size='40 40 40' type='plane' conaffinity='1' rgba='0.8 0.9 0.8 1' condim='3' />
+    <body name='torso' pos='0 0 .7'>
+      <joint name='rootx' type='slide' pos='0 0 0' axis='1 0 0' limited='false' damping='0' armature='0' stiffness='0' />
+      <joint name='rootz' type='slide' pos='0 0 0' axis='0 0 1' limited='false' damping='0' armature='0' stiffness='0' />
+      <joint name='rooty' type='hinge' pos='0 0 0' axis='0 1 0' limited='false' damping='0' armature='0' stiffness='0' />
+      <geom name='torso' type='capsule' fromto='-.5 0 0 .5 0 0' size='0.046' />
+      <geom name='head' type='capsule' pos='.6 0 .1' axisangle='0 1 0 .87' size='0.046 .15' />
+      <!-- <site name='tip'  pos='.15 0 .11'/> -->
+      <body name='bthigh' pos='-.5 0 0'>
+        <joint name='bthigh' type='hinge' pos='0 0 0' axis='0 1 0' range='-.52 1.05' stiffness='240' damping='6' />
+        <geom name='bthigh' type='capsule' pos='.1 0 -.13' axisangle='0 1 0 -3.8' size='0.046 .145' />
+        <body name='bshin' pos='.16 0 -.25'>
+          <joint name='bshin' type='hinge' pos='0 0 0' axis='0 1 0' range='-.785 .785' stiffness='180' damping='4.5' />
+          <geom name='bshin' type='capsule' pos='-.14 0 -.07' axisangle='0 1 0 -2.03' size='0.046 .15' rgba='0.9 0.6 0.6 1' />
+          <body name='bfoot' pos='-.28 0 -.14'>
+            <joint name='bfoot' type='hinge' pos='0 0 0' axis='0 1 0' range='-.4 .785' stiffness='120' damping='3' />
+            <geom name='bfoot' type='capsule' pos='.03 0 -.097' axisangle='0 1 0 -.27' size='0.046 .094' rgba='0.9 0.6 0.6 1' />
+          </body>
+        </body>
+      </body>
+      <body name='fthigh' pos='.5 0 0'>
+        <joint name='fthigh' type='hinge' pos='0 0 0' axis='0 1 0' range='-1 .7' stiffness='180' damping='4.5' />
+        <geom name='fthigh' type='capsule' pos='-.07 0 -.12' axisangle='0 1 0 .52' size='0.046 .133' />
+        <body name='fshin' pos='-.14 0 -.24'>
+          <joint name='fshin' type='hinge' pos='0 0 0' axis='0 1 0' range='-1.2 .87' stiffness='120' damping='3' />
+          <geom name='fshin' type='capsule' pos='.065 0 -.09' axisangle='0 1 0 -.6' size='0.046 .106' rgba='0.9 0.6 0.6 1' />
+          <body name='ffoot' pos='.13 0 -.18'>
+            <joint name='ffoot' type='hinge' pos='0 0 0' axis='0 1 0' range='-.5 .5' stiffness='60' damping='1.5' />
+            <geom name='ffoot' type='capsule' pos='.045 0 -.07' axisangle='0 1 0 -.6' size='0.046 .07' rgba='0.9 0.6 0.6 1' />
+          </body>
+        </body>
+      </body>
+    </body>
+  </worldbody>
+  <actuator>
+    <motor name='bthigh' joint='bthigh' gear='120' />
+    <motor name='bshin' joint='bshin' gear='90' />
+    <motor name='bfoot' joint='bfoot' gear='60' />
+    <motor name='fthigh' joint='fthigh' gear='120' />
+    <motor name='fshin' joint='fshin' gear='60' />
+    <motor name='ffoot' joint='ffoot' gear='30' />
+  </actuator>
+</mujoco>

From c032a98fe5b0d668b1266552783fb2b07b05cd62 Mon Sep 17 00:00:00 2001
From: JasmineDeng <jasmine.deng@gmail.com>
Date: Thu, 23 Mar 2017 18:00:10 -0700
Subject: [PATCH 02/10] moved cheetah and stacker to new files, created stacker
 environment

---
 envs/cheetah_agent.py | 59 ++++++++++++++++++++++++++++++
 envs/move_agent.py    | 61 ++-----------------------------
 envs/stacker_agent.py | 84 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 145 insertions(+), 59 deletions(-)
 create mode 100644 envs/cheetah_agent.py
 create mode 100644 envs/stacker_agent.py

diff --git a/envs/cheetah_agent.py b/envs/cheetah_agent.py
new file mode 100644
index 0000000..42a7805
--- /dev/null
+++ b/envs/cheetah_agent.py
@@ -0,0 +1,59 @@
+from move_agent import MoveTeleportSimulator, ObsIm, BaseRewarder
+
+class HalfCheetahSimulator(MoveTeleportSimulator):
+  def __init__(self, **kwargs):
+    super(HalfCheetahSimulator, self).__init__(**kwargs)
+    self.model = MjModel('../rlmaster/envs/mujoco_envs/xmls/half_cheetah.xml')
+    self._pos = {}
+    self._pos['torso'] = np.zeros((3,))
+    self._range_min = -1
+    self._range_max = 1
+
+    self._imSz = 32
+    self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
+    self.body_comvel = 0
+    self.action = np.zeros((1, 2))
+
+  @overrides
+  def step(self, ctrl):
+    self.model.data.ctrl = ctrl
+    for i in range(10):
+      self.model.step()
+    self.model.forward()
+    ind = self.model.body_names.index('torso')
+    self._pos['torso'] = self.model.body_pos[ind]
+    self.body_comvel = self.model.body_comvels[ind]
+    self.action = ctrl
+
+  @overrides 
+  def get_image(self):
+    imSz = self._imSz
+    rng = np.linspace(self._range_min, self._range_max, imSz)
+    g_x, g_y = self._get_bin(rng, self._pos['torso'])
+    self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)   
+    self._plot_object((g_x, g_y), 'r')
+    return self._im.copy()
+
+class CheetahIm(ObsIm):
+  @overrides
+  def observation(self):
+    obs = {}
+    obs['im'] = self.simulator.get_image().flatten()
+    return obs
+
+class RewardCheetah(BaseRewarder):
+
+  @property
+  def action(self):
+    return self.prms['sim'].action if hasattr(self.prms['sim'], 'action') else np.zeros((1,2))
+
+  @property
+  def body_comvel(self):
+    return self.prms['sim'].body_comvel if hasattr(self.prms['sim'], 'body_comvel') else 0
+
+  @overrides
+  def get(self):
+    ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(self.action))
+    run_cost = -1 * self.body_comvel[0]
+    return ctrl_cost + run_cost
+  
\ No newline at end of file
diff --git a/envs/move_agent.py b/envs/move_agent.py
index 01811cb..3f1c725 100644
--- a/envs/move_agent.py
+++ b/envs/move_agent.py
@@ -3,6 +3,8 @@
 from overrides import overrides
 from rllab.mujoco_py import MjModel
 
+from cheetah_agent import HalfCheetahSimulator, CheetahIm, RewardCheetah
+from stacker_agent import SimpleStackerSimulator, StackerIm, RewardStacker
 from pyhelper_fns import vis_utils
 
 def str2action(cmd):
@@ -154,40 +156,6 @@ def _setup_renderer(self):
   def render(self):
     self._canvas._display(self.get_image())
 
-class HalfCheetahSimulator(MoveTeleportSimulator):
-  def __init__(self, **kwargs):
-    super(HalfCheetahSimulator, self).__init__(**kwargs)
-    self.model = MjModel('rlmaster/envs/mujoco_envs/xmls/half_cheetah.xml')
-    self._pos = {}
-    self._pos['torso'] = np.zeros((3,))
-    self._range_min = -1
-    self._range_max = 1
-
-    self._imSz = 32
-    self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
-    self.body_comvel = 0
-    self.action = np.zeros((1, 2))
-
-  @overrides
-  def step(self, ctrl):
-    self.model.data.ctrl = ctrl
-    for i in range(10):
-      self.model.step()
-    self.model.forward()
-    ind = self.model.body_names.index('torso')
-    self._pos['torso'] = self.model.body_pos[ind]
-    self.body_comvel = self.model.body_comvels[ind]
-    self.action = ctrl
-
-  @overrides 
-  def get_image(self):
-    imSz = self._imSz
-    rng = np.linspace(self._range_min, self._range_max, imSz)
-    g_x, g_y = self._get_bin(rng, self._pos['torso'])
-    self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)   
-    self._plot_object((g_x, g_y), 'r')
-    return self._im.copy()
-  
 class InitFixed(BaseInitializer):
   @overrides
   def sample_env_init(self):
@@ -204,7 +172,6 @@ def sample_env_init(self):
       self.simulator._pos[k] = range_mag * self.random.rand(2,) + \
                                self.simulator._range_min
 
-
 class ObsState(BaseObservation):
   @overrides
   def ndim(self):
@@ -234,14 +201,6 @@ def observation(self):
     obs['im'] =  self.simulator.get_image()
     return obs
 
-class CheetahIm(ObsIm):
-  @overrides
-  def observation(self):
-    obs = {}
-    obs['im'] = self.simulator.get_image().flatten()
-    return obs
-
-
 class RewardSimple(BaseRewarder):
   #The radius around the goal in which reward is provided to the agent.
   @property
@@ -255,22 +214,6 @@ def get(self):
     else:
       return 0 
 
-class RewardCheetah(BaseRewarder):
-
-  @property
-  def action(self):
-    return self.prms['sim'].action if hasattr(self.prms['sim'], 'action') else np.zeros((1,2))
-
-  @property
-  def body_comvel(self):
-    return self.prms['sim'].body_comvel if hasattr(self.prms['sim'], 'body_comvel') else 0
-
-  @overrides
-  def get(self):
-    ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(self.action))
-    run_cost = -1 * self.body_comvel[0]
-    return ctrl_cost + run_cost
- 
 
 def get_environment(initName='InitRandom', obsName='ObsIm', rewName='RewardSimple',
                     actType='DiscreteActionFour', simName='MoveTeleportSimulator',
diff --git a/envs/stacker_agent.py b/envs/stacker_agent.py
new file mode 100644
index 0000000..fd00822
--- /dev/null
+++ b/envs/stacker_agent.py
@@ -0,0 +1,84 @@
+from move_agent import MoveTeleportSimulator, ObsIm, BaseRewarder
+
+class StackedBox():
+
+  def __init__(self, pos, width=3, height=3, length=3):
+    assert len(pos) == 3, 'must enter an xyz position'
+
+    self.pos = pos
+    self.width = width 
+    self.height = height
+    self.length = length
+
+  def contains(self, new_pos):
+    x, y, z = self.pos
+    new_x, new_y, new_z = new_pos
+    return new_x >= x and new_x <= x + self.width \
+      and new_y >= y and new_y <= y + self.height and new_z >= z and new_z <= z + self.length
+
+class SimpleStackerSimulator(MoveTeleportSimulator):
+  def __init__(self, **kwargs):
+    super(SimpleStackerSimulator, self).__init__(**kwargs)
+    self.width = self._imSz
+    self.height = self._imSz
+    self.block1 = StackedBox(np.random.randint(0, 32 - 3, size=3))
+    self.block1.pos[2] = 0
+    self.block2 = StackedBox(np.random.randint(0, 32 - 3, size=3))
+    self.block2.pos[2] = 0
+    if self.block1.contains(self.block2.pos):
+      self.block2.pos[2] = self.block1.height
+
+    self._imSz = 32
+    self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
+    self._range_min = 0
+    self._range_max = 32 - self.block1.width
+
+  @overrides
+  def step(self, pos):
+    assert len(pos) == 2, 'step takes in an xy position'
+
+    x, y = min(pos[0], 32 - 3), min(pos[1], 32 - 3)
+    self.block2.pos = np.array([x, y, 0])
+    if self.block1.contains(self.block2.pos):
+      self.block2.pos[2] = self.block1.height
+
+  @overrides
+  def _plot_object(self, coords, color='r'):
+    x, y = coords
+    mnx, mxx  = x, min(self._imSz, x + self.block1.width)
+    mny, mxy  = y, min(self._imSz, y + self.block1.height)
+    if color == 'r':
+      self._im[mny:mxy, mnx:mxx, 0] = 255
+    elif color == 'g':
+      self._im[mny:mxy, mnx:mxx, 1] = 255
+    else:
+      self._im[mny:mxy, mnx:mxx, 2] = 255
+
+  @overrides
+  def get_image(self):
+    imSz = self._imSz
+    rng = np.linspace(self._range_min, self._range_max, imSz)
+    x_1, y_1 = self.block1.pos[0], self.block1.pos[1]
+    x_2, y_2 = self.block2.pos[0], self.block2.pos[1]
+    self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)
+    self._plot_object((x_1, y_1), 'r')
+    self._plot_object((x_2, y_2), 'g')
+    return self._im.copy()
+
+class StackerIm(ObsIm):
+  
+  @overrides
+  def observation(self):
+    obs = {}
+    obs['im'] = self.simulator.get_image().flatten()
+    return obs
+
+class RewardStacker(BaseRewarder):
+  
+  @property
+  def block_height(self):
+    return self.prms['sim'].block2.pos[2] if hasattr(self.prms['sim'], 'block2') else 0
+
+  @overrides
+  def get(self):
+    return self.block_height

From 8e7298f91435e025acd3278f2156f258d6d46a67 Mon Sep 17 00:00:00 2001
From: JasmineDeng <jasmine.deng@gmail.com>
Date: Fri, 24 Mar 2017 11:29:44 -0700
Subject: [PATCH 03/10] allowed rendering for stacker agent

---
 envs/move_agent.py | 144 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 142 insertions(+), 2 deletions(-)

diff --git a/envs/move_agent.py b/envs/move_agent.py
index 3f1c725..a016e9e 100644
--- a/envs/move_agent.py
+++ b/envs/move_agent.py
@@ -3,8 +3,6 @@
 from overrides import overrides
 from rllab.mujoco_py import MjModel
 
-from cheetah_agent import HalfCheetahSimulator, CheetahIm, RewardCheetah
-from stacker_agent import SimpleStackerSimulator, StackerIm, RewardStacker
 from pyhelper_fns import vis_utils
 
 def str2action(cmd):
@@ -214,6 +212,148 @@ def get(self):
     else:
       return 0 
 
+class StackedBox():
+
+  def __init__(self, pos, width=3, height=3, length=3):
+    assert len(pos) == 3, 'must enter an xyz position'
+
+    self.pos = pos
+    self.width = width 
+    self.height = height
+    self.length = length
+
+  def contains(self, new_pos):
+    x, y, z = self.pos
+    new_x, new_y, new_z = new_pos
+    return new_x >= x and new_x <= x + self.width \
+      and new_y >= y and new_y <= y + self.height and new_z >= z and new_z <= z + self.length
+
+class SimpleStackerSimulator(MoveTeleportSimulator):
+  def __init__(self, **kwargs):
+    super(SimpleStackerSimulator, self).__init__(**kwargs)
+    self.width = self._imSz
+    self.height = self._imSz
+    self.block1 = StackedBox(np.random.randint(0, 32 - 3, size=3))
+    self.block1.pos[2] = 0
+    self.block2 = StackedBox(np.random.randint(0, 32 - 3, size=3))
+    self.block2.pos[2] = 0
+    if self.block1.contains(self.block2.pos):
+      self.block2.pos[2] = self.block1.height
+
+    self._imSz = 32
+    self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
+    self._range_min = 0
+    self._range_max = 32 - self.block1.width
+
+  @overrides
+  def step(self, pos):
+    assert len(pos) == 2, 'step takes in an xy position'
+
+    x, y = min(pos[0], 32 - 3), min(pos[1], 32 - 3)
+    self.block2.pos = np.array([x, y, 0])
+    if self.block1.contains(self.block2.pos):
+      self.block2.pos[2] = self.block1.height
+
+  @overrides
+  def _plot_object(self, coords, color='r'):
+    x, y = coords
+    mnx, mxx  = x, min(self._imSz, x + self.block1.width)
+    mny, mxy  = y, min(self._imSz, y + self.block1.height)
+    if color == 'r':
+      self._im[mny:mxy, mnx:mxx, 0] = 255
+    elif color == 'g':
+      self._im[mny:mxy, mnx:mxx, 1] = 255
+    else:
+      self._im[mny:mxy, mnx:mxx, 2] = 255
+
+  @overrides
+  def get_image(self):
+    imSz = self._imSz
+    rng = np.linspace(self._range_min, self._range_max, imSz)
+    x_1, y_1 = self.block1.pos[0], self.block1.pos[1]
+    x_2, y_2 = self.block2.pos[0], self.block2.pos[1]
+    self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)
+    self._plot_object((x_1, y_1), 'r')
+    self._plot_object((x_2, y_2), 'g')
+    return self._im.copy()
+
+class StackerIm(ObsIm):
+  
+  @overrides
+  def observation(self):
+    obs = {}
+    obs['im'] = self.simulator.get_image().flatten()
+    return obs
+
+class RewardStacker(BaseRewarder):
+  
+  @property
+  def block_height(self):
+    return self.prms['sim'].block2.pos[2] if hasattr(self.prms['sim'], 'block2') else 0
+
+  @overrides
+  def get(self):
+    return self.block_height
+
+from rlmaster.envs.move_agent import MoveTeleportSimulator, ObsIm, BaseRewarder
+
+class HalfCheetahSimulator(MoveTeleportSimulator):
+  def __init__(self, **kwargs):
+    super(HalfCheetahSimulator, self).__init__(**kwargs)
+    self.model = MjModel('../rlmaster/envs/mujoco_envs/xmls/half_cheetah.xml')
+    self._pos = {}
+    self._pos['torso'] = np.zeros((3,))
+    self._range_min = -1
+    self._range_max = 1
+
+    self._imSz = 32
+    self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
+    self.body_comvel = 0
+    self.action = np.zeros((1, 2))
+
+  @overrides
+  def step(self, ctrl):
+    self.model.data.ctrl = ctrl
+    for i in range(10):
+      self.model.step()
+    self.model.forward()
+    ind = self.model.body_names.index('torso')
+    self._pos['torso'] = self.model.body_pos[ind]
+    self.body_comvel = self.model.body_comvels[ind]
+    self.action = ctrl
+
+  @overrides 
+  def get_image(self):
+    imSz = self._imSz
+    rng = np.linspace(self._range_min, self._range_max, imSz)
+    g_x, g_y = self._get_bin(rng, self._pos['torso'])
+    self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)   
+    self._plot_object((g_x, g_y), 'r')
+    return self._im.copy()
+
+class CheetahIm(ObsIm):
+  @overrides
+  def observation(self):
+    obs = {}
+    obs['im'] = self.simulator.get_image().flatten()
+    return obs
+
+class RewardCheetah(BaseRewarder):
+
+  @property
+  def action(self):
+    return self.prms['sim'].action if hasattr(self.prms['sim'], 'action') else np.zeros((1,2))
+
+  @property
+  def body_comvel(self):
+    return self.prms['sim'].body_comvel if hasattr(self.prms['sim'], 'body_comvel') else 0
+
+  @overrides
+  def get(self):
+    ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(self.action))
+    run_cost = -1 * self.body_comvel[0]
+    return ctrl_cost + run_cost
+
 
 def get_environment(initName='InitRandom', obsName='ObsIm', rewName='RewardSimple',
                     actType='DiscreteActionFour', simName='MoveTeleportSimulator',

From 89567ea544a1c67db03c52d8d65f81b61d739863 Mon Sep 17 00:00:00 2001
From: Pulkit <pulkitag@berkeley.edu>
Date: Fri, 24 Mar 2017 12:27:37 -0700
Subject: [PATCH 04/10] Issues in code structure

---
 envs/stacker_agent.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/envs/stacker_agent.py b/envs/stacker_agent.py
index fd00822..aa2f94d 100644
--- a/envs/stacker_agent.py
+++ b/envs/stacker_agent.py
@@ -16,6 +16,8 @@ def contains(self, new_pos):
     return new_x >= x and new_x <= x + self.width \
       and new_y >= y and new_y <= y + self.height and new_z >= z and new_z <= z + self.length
 
+    
+### @pulkitag: This should inherit from BaseSimulator. Any reason for this? 
 class SimpleStackerSimulator(MoveTeleportSimulator):
   def __init__(self, **kwargs):
     super(SimpleStackerSimulator, self).__init__(**kwargs)
@@ -35,6 +37,8 @@ def __init__(self, **kwargs):
 
   @overrides
   def step(self, pos):
+    #####@pulkitag: You should make use of BaseDiscreteAction and BaseContinuousAction
+    #### to input the actions. 
     assert len(pos) == 2, 'step takes in an xy position'
 
     x, y = min(pos[0], 32 - 3), min(pos[1], 32 - 3)
@@ -65,14 +69,15 @@ def get_image(self):
     self._plot_object((x_2, y_2), 'g')
     return self._im.copy()
 
-class StackerIm(ObsIm):
-  
+####@pulkitag: Again, why are you inherting from ObsIm? 
+class StackerIm(ObsIm): 
   @overrides
   def observation(self):
     obs = {}
     obs['im'] = self.simulator.get_image().flatten()
     return obs
 
+####@pulkitag: This is good. 
 class RewardStacker(BaseRewarder):
   
   @property
@@ -82,3 +87,6 @@ def block_height(self):
   @overrides
   def get(self):
     return self.block_height
+  
+###@pulkitag: See get_environment in move_agent - you should have a similar function
+### so that it easy to instatiate a basic version of the environment. 

From 11958d75ba6c55f9e462dee7591677bbd9eac753 Mon Sep 17 00:00:00 2001
From: JasmineDeng <jasmine.deng@gmail.com>
Date: Fri, 24 Mar 2017 18:04:13 -0700
Subject: [PATCH 05/10] made cheetah and stacker into separate modules, fixed
 cheetah rendering

---
 core/gym_wrapper.py   |   1 -
 envs/cheetah_agent.py | 171 +++++++++++++++++++++++++++++-------------
 envs/move_agent.py    | 143 -----------------------------------
 envs/stacker_agent.py |  76 +++++++++++++++----
 4 files changed, 179 insertions(+), 212 deletions(-)

diff --git a/core/gym_wrapper.py b/core/gym_wrapper.py
index dd9d9d6..ebde5a8 100644
--- a/core/gym_wrapper.py
+++ b/core/gym_wrapper.py
@@ -79,5 +79,4 @@ def render(self):
 
 
   def _render(self):
-    print('i am rendering')
     return self.env.render() 
diff --git a/envs/cheetah_agent.py b/envs/cheetah_agent.py
index 42a7805..6ba9724 100644
--- a/envs/cheetah_agent.py
+++ b/envs/cheetah_agent.py
@@ -1,59 +1,122 @@
-from move_agent import MoveTeleportSimulator, ObsIm, BaseRewarder
-
-class HalfCheetahSimulator(MoveTeleportSimulator):
-  def __init__(self, **kwargs):
-    super(HalfCheetahSimulator, self).__init__(**kwargs)
-    self.model = MjModel('../rlmaster/envs/mujoco_envs/xmls/half_cheetah.xml')
-    self._pos = {}
-    self._pos['torso'] = np.zeros((3,))
-    self._range_min = -1
-    self._range_max = 1
-
-    self._imSz = 32
-    self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
-    self.body_comvel = 0
-    self.action = np.zeros((1, 2))
-
-  @overrides
-  def step(self, ctrl):
-    self.model.data.ctrl = ctrl
-    for i in range(10):
-      self.model.step()
-    self.model.forward()
-    ind = self.model.body_names.index('torso')
-    self._pos['torso'] = self.model.body_pos[ind]
-    self.body_comvel = self.model.body_comvels[ind]
-    self.action = ctrl
-
-  @overrides 
-  def get_image(self):
-    imSz = self._imSz
-    rng = np.linspace(self._range_min, self._range_max, imSz)
-    g_x, g_y = self._get_bin(rng, self._pos['torso'])
-    self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)   
-    self._plot_object((g_x, g_y), 'r')
-    return self._im.copy()
-
-class CheetahIm(ObsIm):
-  @overrides
-  def observation(self):
-    obs = {}
-    obs['im'] = self.simulator.get_image().flatten()
-    return obs
+from rlmaster.core.base_environment import *
+import numpy as np
+from overrides import overrides
+from rllab.mujoco_py import MjModel, MjViewer
+
+class HalfCheetahSimulator(BaseSimulator):
+	def __init__(self, **kwargs):
+		super(HalfCheetahSimulator, self).__init__(**kwargs)
+
+		self._imSz = 512
+		self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
+
+		self.model = MjModel('../rlmaster/envs/mujoco_envs/xmls/half_cheetah.xml')
+		self.viewer = None
+		self._pos = {}
+		self._pos['torso'] = np.zeros((3,))
+		self._range_min = -1
+		self._range_max = 1
+
+		self.body_comvel = 0
+		self.action = np.zeros((1, 2))
+
+		self.frame_skip = 1
+
+	@overrides
+	def step(self, ctrl, loop=False):
+		self.model.data.ctrl = ctrl + np.random.normal(size=ctrl.shape)
+		for i in range(self.frame_skip):
+		  self.model.step()
+		self.model.forward()
+		ind = self.model.body_names.index('torso')
+		self._pos['torso'] = self.model.body_pos[ind]
+		self.body_comvel = self.model.body_comvels[ind]
+		self.action = ctrl
+
+	@overrides 
+	def get_image(self):
+		data, width, height = self.viewer.get_image()
+		self._im = np.fromstring(data, dtype='uint8').reshape(height, width, 3)[::-1,:,:]
+		print(self._im)
+		return self._im.copy()
+
+	@overrides
+	def _setup_renderer(self):
+		self.viewer = MjViewer(visible=True, init_width=self._imSz, init_height=self._imSz)
+		self.viewer.start()
+		self.viewer.set_model(self.model)
+
+	@overrides
+	def render(self):
+  		self.viewer.loop_once()
+
+class CheetahIm(BaseObservation):
+
+	def get_body_com(self, body_name):
+		idx = self.simulator.model.body_names.index(body_name)
+		return self.simulator.model.data.com_subtree[idx]
+
+	@overrides
+	def ndim(self):
+		dim = {}
+		dim['im'] = (1, 20)
+		return dim
+
+	@overrides
+	def observation(self):
+		obs = {}
+		current_obs = np.concatenate([
+			self.simulator.model.data.qpos.flatten()[1:],
+			self.simulator.model.data.qvel.flat,
+			self.get_body_com('torso').flat,
+		])
+		obs['im'] = current_obs
+		return obs
 
 class RewardCheetah(BaseRewarder):
 
-  @property
-  def action(self):
-    return self.prms['sim'].action if hasattr(self.prms['sim'], 'action') else np.zeros((1,2))
+	@property
+	def action(self):
+		return self.prms['sim'].action if hasattr(self.prms['sim'], 'action') else np.zeros((1,2))
+
+	@property
+	def body_comvel(self):
+		return self.prms['sim'].body_comvel if hasattr(self.prms['sim'], 'body_comvel') else 0
+
+	@overrides
+	def get(self):
+		ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(self.action))
+		run_cost = -1 * self.body_comvel[0]
+		return -(ctrl_cost + run_cost)
+
+class ContinuousCheetahAction(BaseContinuousAction):
+	@overrides
+	def action_dim(self):
+		return 6
+
+	def minval(self):
+		return -1.25
+
+	def maxval(self):
+		return 1.1
+
+	@overrides
+	def process(self, action):
+		return action
+
+class InitCheetah(BaseInitializer):
 
-  @property
-  def body_comvel(self):
-    return self.prms['sim'].body_comvel if hasattr(self.prms['sim'], 'body_comvel') else 0
+	@overrides
+	def sample_env_init(self):
+		pass
 
-  @overrides
-  def get(self):
-    ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(self.action))
-    run_cost = -1 * self.body_comvel[0]
-    return ctrl_cost + run_cost
-  
\ No newline at end of file
+def get_environment(max_episode_length=100, initPrms={}, obsPrms={}, actPrms={}):
+	sim = HalfCheetahSimulator()
+	initObj = InitCheetah(sim, initPrms)
+	obsObj = CheetahIm(sim, obsPrms)
+	rewPrms = { 'sim': sim }
+	rewObj = RewardCheetah(sim, rewPrms)
+	actObj = ContinuousCheetahAction(actPrms)
+	env = BaseEnvironment(sim, initObj, obsObj, rewObj, actObj, 
+		params={'max_episode_length':max_episode_length})
+	return env
diff --git a/envs/move_agent.py b/envs/move_agent.py
index a016e9e..ae3f90a 100644
--- a/envs/move_agent.py
+++ b/envs/move_agent.py
@@ -212,149 +212,6 @@ def get(self):
     else:
       return 0 
 
-class StackedBox():
-
-  def __init__(self, pos, width=3, height=3, length=3):
-    assert len(pos) == 3, 'must enter an xyz position'
-
-    self.pos = pos
-    self.width = width 
-    self.height = height
-    self.length = length
-
-  def contains(self, new_pos):
-    x, y, z = self.pos
-    new_x, new_y, new_z = new_pos
-    return new_x >= x and new_x <= x + self.width \
-      and new_y >= y and new_y <= y + self.height and new_z >= z and new_z <= z + self.length
-
-class SimpleStackerSimulator(MoveTeleportSimulator):
-  def __init__(self, **kwargs):
-    super(SimpleStackerSimulator, self).__init__(**kwargs)
-    self.width = self._imSz
-    self.height = self._imSz
-    self.block1 = StackedBox(np.random.randint(0, 32 - 3, size=3))
-    self.block1.pos[2] = 0
-    self.block2 = StackedBox(np.random.randint(0, 32 - 3, size=3))
-    self.block2.pos[2] = 0
-    if self.block1.contains(self.block2.pos):
-      self.block2.pos[2] = self.block1.height
-
-    self._imSz = 32
-    self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
-    self._range_min = 0
-    self._range_max = 32 - self.block1.width
-
-  @overrides
-  def step(self, pos):
-    assert len(pos) == 2, 'step takes in an xy position'
-
-    x, y = min(pos[0], 32 - 3), min(pos[1], 32 - 3)
-    self.block2.pos = np.array([x, y, 0])
-    if self.block1.contains(self.block2.pos):
-      self.block2.pos[2] = self.block1.height
-
-  @overrides
-  def _plot_object(self, coords, color='r'):
-    x, y = coords
-    mnx, mxx  = x, min(self._imSz, x + self.block1.width)
-    mny, mxy  = y, min(self._imSz, y + self.block1.height)
-    if color == 'r':
-      self._im[mny:mxy, mnx:mxx, 0] = 255
-    elif color == 'g':
-      self._im[mny:mxy, mnx:mxx, 1] = 255
-    else:
-      self._im[mny:mxy, mnx:mxx, 2] = 255
-
-  @overrides
-  def get_image(self):
-    imSz = self._imSz
-    rng = np.linspace(self._range_min, self._range_max, imSz)
-    x_1, y_1 = self.block1.pos[0], self.block1.pos[1]
-    x_2, y_2 = self.block2.pos[0], self.block2.pos[1]
-    self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)
-    self._plot_object((x_1, y_1), 'r')
-    self._plot_object((x_2, y_2), 'g')
-    return self._im.copy()
-
-class StackerIm(ObsIm):
-  
-  @overrides
-  def observation(self):
-    obs = {}
-    obs['im'] = self.simulator.get_image().flatten()
-    return obs
-
-class RewardStacker(BaseRewarder):
-  
-  @property
-  def block_height(self):
-    return self.prms['sim'].block2.pos[2] if hasattr(self.prms['sim'], 'block2') else 0
-
-  @overrides
-  def get(self):
-    return self.block_height
-
-from rlmaster.envs.move_agent import MoveTeleportSimulator, ObsIm, BaseRewarder
-
-class HalfCheetahSimulator(MoveTeleportSimulator):
-  def __init__(self, **kwargs):
-    super(HalfCheetahSimulator, self).__init__(**kwargs)
-    self.model = MjModel('../rlmaster/envs/mujoco_envs/xmls/half_cheetah.xml')
-    self._pos = {}
-    self._pos['torso'] = np.zeros((3,))
-    self._range_min = -1
-    self._range_max = 1
-
-    self._imSz = 32
-    self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
-    self.body_comvel = 0
-    self.action = np.zeros((1, 2))
-
-  @overrides
-  def step(self, ctrl):
-    self.model.data.ctrl = ctrl
-    for i in range(10):
-      self.model.step()
-    self.model.forward()
-    ind = self.model.body_names.index('torso')
-    self._pos['torso'] = self.model.body_pos[ind]
-    self.body_comvel = self.model.body_comvels[ind]
-    self.action = ctrl
-
-  @overrides 
-  def get_image(self):
-    imSz = self._imSz
-    rng = np.linspace(self._range_min, self._range_max, imSz)
-    g_x, g_y = self._get_bin(rng, self._pos['torso'])
-    self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)   
-    self._plot_object((g_x, g_y), 'r')
-    return self._im.copy()
-
-class CheetahIm(ObsIm):
-  @overrides
-  def observation(self):
-    obs = {}
-    obs['im'] = self.simulator.get_image().flatten()
-    return obs
-
-class RewardCheetah(BaseRewarder):
-
-  @property
-  def action(self):
-    return self.prms['sim'].action if hasattr(self.prms['sim'], 'action') else np.zeros((1,2))
-
-  @property
-  def body_comvel(self):
-    return self.prms['sim'].body_comvel if hasattr(self.prms['sim'], 'body_comvel') else 0
-
-  @overrides
-  def get(self):
-    ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(self.action))
-    run_cost = -1 * self.body_comvel[0]
-    return ctrl_cost + run_cost
-
-
 def get_environment(initName='InitRandom', obsName='ObsIm', rewName='RewardSimple',
                     actType='DiscreteActionFour', simName='MoveTeleportSimulator',
                     max_episode_length=100, initPrms={}, obsPrms={}, rewPrms={}, 
diff --git a/envs/stacker_agent.py b/envs/stacker_agent.py
index fd00822..0a53308 100644
--- a/envs/stacker_agent.py
+++ b/envs/stacker_agent.py
@@ -1,4 +1,7 @@
-from move_agent import MoveTeleportSimulator, ObsIm, BaseRewarder
+from rlmaster.core.base_environment import *
+import numpy as np
+from overrides import overrides
+from pyhelper_fns import vis_utils
 
 class StackedBox():
 
@@ -16,19 +19,15 @@ def contains(self, new_pos):
     return new_x >= x and new_x <= x + self.width \
       and new_y >= y and new_y <= y + self.height and new_z >= z and new_z <= z + self.length
 
-class SimpleStackerSimulator(MoveTeleportSimulator):
+class SimpleStackerSimulator(BaseSimulator):
   def __init__(self, **kwargs):
     super(SimpleStackerSimulator, self).__init__(**kwargs)
+    self._imSz = 32
     self.width = self._imSz
     self.height = self._imSz
-    self.block1 = StackedBox(np.random.randint(0, 32 - 3, size=3))
-    self.block1.pos[2] = 0
-    self.block2 = StackedBox(np.random.randint(0, 32 - 3, size=3))
-    self.block2.pos[2] = 0
-    if self.block1.contains(self.block2.pos):
-      self.block2.pos[2] = self.block1.height
+    self.block1 = StackedBox(np.zeros(3))
+    self.block2 = StackedBox(np.zeros(3))
 
-    self._imSz = 32
     self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
     self._range_min = 0
     self._range_max = 32 - self.block1.width
@@ -42,11 +41,10 @@ def step(self, pos):
     if self.block1.contains(self.block2.pos):
       self.block2.pos[2] = self.block1.height
 
-  @overrides
   def _plot_object(self, coords, color='r'):
     x, y = coords
-    mnx, mxx  = x, min(self._imSz, x + self.block1.width)
-    mny, mxy  = y, min(self._imSz, y + self.block1.height)
+    mnx, mxx  = int(x), int(min(self._imSz, x + self.block1.width))
+    mny, mxy  = int(y), int(min(self._imSz, y + self.block1.height))
     if color == 'r':
       self._im[mny:mxy, mnx:mxx, 0] = 255
     elif color == 'g':
@@ -65,12 +63,26 @@ def get_image(self):
     self._plot_object((x_2, y_2), 'g')
     return self._im.copy()
 
-class StackerIm(ObsIm):
+  @overrides 
+  def _setup_renderer(self):
+    self._canvas = vis_utils.MyAnimation(None, height=self._imSz, width=self._imSz)
+
+  @overrides
+  def render(self):
+    self._canvas._display(self.get_image())
+
+class StackerIm(BaseObservation):
+
+  @overrides
+  def ndim(self):
+    dim = {}
+    dim['im'] = (self.simulator._imSz, self.simulator._imSz, 3)
+    return dim
   
   @overrides
   def observation(self):
     obs = {}
-    obs['im'] = self.simulator.get_image().flatten()
+    obs['im'] = self.simulator.get_image()
     return obs
 
 class RewardStacker(BaseRewarder):
@@ -82,3 +94,39 @@ def block_height(self):
   @overrides
   def get(self):
     return self.block_height
+
+class ContinuousStackerAction(BaseContinuousAction):
+  @overrides
+  def action_dim(self):
+    return 2
+
+  @overrides
+  def process(self, action):
+    return np.around(action, decimals=0)
+
+  def minval(self):
+    return 0
+
+  def maxval(self):
+    return 32
+
+class InitStacker(BaseInitializer):
+  @overrides
+  def sample_env_init(self):
+    self.simulator.block1.pos = np.random.randint(0, 32 - 3, size=3)
+    self.simulator.block1.pos[2] = 0
+    self.simulator.block2.pos = np.random.randint(0, 32 - 3, size=3)
+    self.simulator.block2.pos[2] = 0
+    if self.simulator.block1.contains(self.simulator.block2.pos):
+      self.simulator.block2.pos[2] = self.simulator.block1.height
+
+def get_environment(max_episode_length=100, initPrms={}, obsPrms={}, rewPrms={}, actPrms={}):
+  sim = SimpleStackerSimulator()
+  initObj = InitStacker(initPrms)
+  obsObj = StackerIm(sim, obsPrms)
+  rewPrms = { 'sim': sim }
+  rewObj = RewardStacker(sim, rewPrms)
+  actObj = ContinuousStackerAction(actPrms)
+  env = BaseEnvironment(sim, initObj, obsObj, rewObj, actObj,
+    params={'max_episode_length':max_episode_length})
+  return env

From 3e94883fca10bf3d0d7cfbe8a32b1b99da030983 Mon Sep 17 00:00:00 2001
From: JasmineDeng <jasmine.deng@gmail.com>
Date: Sun, 26 Mar 2017 03:11:55 -0700
Subject: [PATCH 06/10] initialized cheetah correctly, modified README

---
 README.md             | 10 ++++++----
 envs/cheetah_agent.py | 14 ++++++++++++--
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index dd1fa7c..3e58f48 100644
--- a/README.md
+++ b/README.md
@@ -20,8 +20,9 @@ An environment is defined by:
 - ActionProcessor: an object of `type BaseAction` that defines functions that process the action. Generally simulator works
                    by applying continous forces/torque, however the desired action space might be discrete or continuous. 
                    `BaseAction` objects allow the user to easily switch between different action spaces. 
-                   
+
 ### Interactive Mode
+
 To run environments in interactive mode, a function mapping the keyboard inputs into commands supplied to the agent must be defined. This function is typically called, `str2action` in `rlmaster` repository. The environment can be run in interactive mode in the following way:
 
 ```
@@ -31,7 +32,9 @@ env.interactive(move_single_env.str2action)
 ```
 You can use the commands, `w, s, d, a` to move the agent and `q` to quit the interactive mode.
 
-                   
+### Cheetah in Interactive Mode
+
+No interface has been developed yet, but after creating an instance of SimpleStackerAgent in from `stacker_agent`, calling `_setup_renderer` and `render` will open a display window. You can step by entering an xy position that will move the second block there instantly. Calling `render` again will show the updated positions of the blocks. They are initialized at position (0,0) if not called with the initializer.
 
 ##Environment in openAI gym format
 
@@ -41,8 +44,7 @@ from core import gym_wrapper
 env = move_agent.get_environment()
 gymEnv = gym_wwapper.GymWrapper(env)
 ```
-                   
-            
+                               
 
 
 
diff --git a/envs/cheetah_agent.py b/envs/cheetah_agent.py
index ca20471..973da33 100644
--- a/envs/cheetah_agent.py
+++ b/envs/cheetah_agent.py
@@ -19,7 +19,10 @@ def __init__(self, **kwargs):
 
 		self.body_comvel = 0
 		self.action = np.zeros((1, 2))
-
+		self.init_qpos = self.model.data.qpos
+		self.init_qvel = self.model.data.qvel 
+		self.init_qacc = self.model.data.qacc 
+		self.init_ctrl = self.model.data.ctrl
 		self.frame_skip = 1
 
 	@overrides
@@ -107,9 +110,16 @@ def process(self, action):
 # TODO(jasmine): fix this so it initializes (is it called at the start of each epoch? find out)
 class InitCheetah(BaseInitializer):
 
+	def reset_mujoco(self):
+		self.simulator.model.data.qpos = self.simulator.init_qpos
+		self.simulator.model.data.qvel = self.simulator.init_qvel
+		self.simulator.model.data.qacc = self.simulator.init_qacc
+		self.simulator.model.data.ctrl = self.simulator.init_ctrl
+
 	@overrides
 	def sample_env_init(self):
-		pass
+		self.reset_mujoco()
+		self.simulator.model.forward()
 
 def get_environment(max_episode_length=100, initPrms={}, obsPrms={}, actPrms={}):
 	sim = HalfCheetahSimulator()

From 1625572f0c3a5c9485b4f5829cc87fd3750ffb58 Mon Sep 17 00:00:00 2001
From: JasmineDeng <jasmine.deng@gmail.com>
Date: Fri, 7 Apr 2017 18:13:59 -0700
Subject: [PATCH 07/10] working stacker environment

---
 envs/cheetah_agent.py |  10 ++-
 envs/move_agent.py    |   6 +-
 envs/reacher_agent.py | 196 ++++++++++++++++++++++++++++++++++++++++++
 envs/stacker_agent.py | 114 +++++++++++++++---------
 4 files changed, 278 insertions(+), 48 deletions(-)
 create mode 100644 envs/reacher_agent.py

diff --git a/envs/cheetah_agent.py b/envs/cheetah_agent.py
index 973da33..cc2351b 100644
--- a/envs/cheetah_agent.py
+++ b/envs/cheetah_agent.py
@@ -27,7 +27,9 @@ def __init__(self, **kwargs):
 
 	@overrides
 	def step(self, ctrl, loop=False):
-		self.model.data.ctrl = ctrl + np.random.normal(size=ctrl.shape)
+		ctrl = np.clip(ctrl, *(np.array([-1, -1, -1, -1, -1, -1]), np.array([1, 1, 1, 1, 1, 1])))
+		self.model.data.ctrl = ctrl # + np.random.normal(size=ctrl.shape)
+		# print('gym', self.model.data.ctrl)
 		for i in range(self.frame_skip):
 		  self.model.step()
 		self.model.forward()
@@ -40,7 +42,7 @@ def step(self, ctrl, loop=False):
 	def get_image(self):
 		data, width, height = self.viewer.get_image()
 		self._im = np.fromstring(data, dtype='uint8').reshape(height, width, 3)[::-1,:,:]
-		print(self._im)
+		# print(self._im)
 		return self._im.copy()
 
 	@overrides
@@ -111,8 +113,8 @@ def process(self, action):
 class InitCheetah(BaseInitializer):
 
 	def reset_mujoco(self):
-		self.simulator.model.data.qpos = self.simulator.init_qpos
-		self.simulator.model.data.qvel = self.simulator.init_qvel
+		self.simulator.model.data.qpos = self.simulator.init_qpos + 0.01 * np.random.normal(size=self.simulator.init_qpos.shape)
+		self.simulator.model.data.qvel = self.simulator.init_qvel + 0.1 * np.random.normal(size=self.simulator.init_qvel.shape)
 		self.simulator.model.data.qacc = self.simulator.init_qacc
 		self.simulator.model.data.ctrl = self.simulator.init_ctrl
 
diff --git a/envs/move_agent.py b/envs/move_agent.py
index ae3f90a..e8c78ff 100644
--- a/envs/move_agent.py
+++ b/envs/move_agent.py
@@ -184,19 +184,19 @@ def observation(self):
     for i, k in enumerate(self.simulator._pos.keys()):
       obs[2*i, 2*i + 2] = self.simulator._pos[k].copy()
     return obs
-  
  
 class ObsIm(BaseObservation):
   @overrides
   def ndim(self):
     dim = {}
-    dim['im'] = (self.simulator._imSz, self.simulator._imSz, 3)
+    # dim['im'] = (self.simulator._imSz, self.simulator._imSz, 3)
+    dim['im'] = (3072, 1)
     return dim
 
   @overrides
   def observation(self):
     obs = {}
-    obs['im'] =  self.simulator.get_image()
+    obs['im'] =  self.simulator.get_image().flatten()
     return obs
 
 class RewardSimple(BaseRewarder):
diff --git a/envs/reacher_agent.py b/envs/reacher_agent.py
new file mode 100644
index 0000000..4789b4a
--- /dev/null
+++ b/envs/reacher_agent.py
@@ -0,0 +1,196 @@
+from rlmaster.core.base_environment import *
+import numpy as np
+from overrides import overrides
+from pyhelper_fns import vis_utils
+
+def contains(obj1, obj2):
+  x1, y1, _ = obj1.pos.tolist()
+  x2, y2, _ = obj2.pos.tolist()
+  s1, s2 = obj1.size, obj2.size
+  if x2 + w2 <= x1 or x1 + w1 <= x2:
+    return False
+  if y2 + l2 <= y1 or y1 + l1 <= y2:
+    return False
+  return True
+
+class StackedBox():
+
+  def __init__(self, pos, size=3):
+    assert len(pos) == 3, 'must enter an xyz position'
+
+    self.pos = pos.reshape((3,))
+    # height is the height in 3d, box in 2d is width x length
+    self.size = size
+
+class ReacherArm():
+
+  def __init__(self, pos, size=4):
+    # takes in xyz position, but z is implicitly infinity
+    self.pos = pos.reshape((2,))
+    self.size = size
+
+    self.hold_block = False
+
+class SimpleReacherSimulator(BaseSimulator):
+  def __init__(self, **kwargs):
+    # should always have same size blocks (width = height = length = n)
+    super(SimpleStackerSimulator, self).__init__(**kwargs)
+    self._imSz = 32
+    self.width = self._imSz
+    self.height = self._imSz
+    self.move_block = StackedBox(np.zeros(3))
+    self.other_block = StackedBox(np.zeros(3))
+
+    self.reacher_arm = ReacherArm(np.zeros(3))
+
+    self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
+    self._range_min = 0
+    self._range_max = 32 - self.move_block.size
+
+  @overrides
+  def step(self, action):
+    # should take in xyz position, but z position doesn't matter
+    action = action.reshape((5,))
+    pos = action[:3]
+    grab_act = action[3] < 4
+    drop_act = action[4] < 4
+    print(grab_act, drop_act)
+    
+    if grab_act: 
+      if contains(self.reacher_arm, self.move_block):
+        self.reacher_arm.hold_block = True
+
+    elif drop_act:
+      if self.reacher_arm.hold_block:
+        self.reacher_arm.hold_block = False
+        self.move_block.pos[2] = 0
+        if contains(self.other_block, self.move_block):
+          self.move_block.pos[2] = self.other_block.size
+
+    if self.reacher_arm.hold_block:
+      delta = pos[0] - self.reacher_arm.pos[0], pos[1] - self.reacher_arm.pos[1]
+      self.move_block.pos = np.array([self.move_block.pos[0] + delta[0], 
+              self.move_block.pos[1] + delta[1], 10]) # to preserve relative position
+
+    self.reacher_arm.pos = pos
+
+  def _plot_object(self, coords, box, color='r'):
+    x, y = coords
+    width, length = box[0], box[1]
+    mnx, mxx  = int(x), int(min(self._imSz, x + width))
+    mny, mxy  = int(y), int(min(self._imSz, y + length))
+    if color == 'r':
+      self._im[mny:mxy, mnx:mxx, 0] = 255
+    elif color == 'g':
+      self._im[mny:mxy, mnx:mxx, 1] = 255
+    else:
+      self._im[mny:mxy, mnx:mxx, 2] = 255
+
+  @overrides
+  def get_image(self):
+    # other blocks in red, active block in green
+    imSz = self._imSz
+    self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)
+    x, y = self.other_block.pos[0], self.other_block.pos[1]
+    self._plot_object((x, y), (3, 3), 'r')
+    x, y = self.move_block.pos[0], self.move_block.pos[1]
+    self._plot_object((x, y), (3, 3), 'g')
+    x, y = self.reacher_arm.pos[0], self.reacher_arm.pos[1]
+    self._plot_object((x, y), (4, 4), 'b')
+    return self._im.copy()
+
+  @overrides 
+  def _setup_renderer(self):
+    self._canvas = vis_utils.MyAnimation(None, height=self._imSz, width=self._imSz)
+
+  @overrides
+  def render(self):
+    self._canvas._display(self.get_image())
+
+class StackerIm(BaseObservation):
+
+  @overrides
+  def ndim(self):
+    dim = {}
+    dim['im'] = (self.simulator._imSz, self.simulator._imSz, 3)
+    return dim
+
+  @overrides
+  def observation(self):
+    obs = {}
+    obs['im'] = self.simulator.get_image()
+    return obs
+
+class StateIm(BaseObservation):
+
+  @overrides
+  def ndim(self):
+    dim = {}
+    dim['im'] = (9, 1)
+    return dim
+
+  @overrides
+  def observation(self):
+    obs = {}
+    new_im = np.append(self.simulator.move_block.pos, self.simulator.other_block.pos)
+    new_im = np.append(new_im, self.simulator.reacher_arm.pos[:2])
+    new_im = np.append(new_im, np.array([1 if self.simulator.reacher_arm.hold_block else 0]))
+    obs['im'] = new_im
+    return obs
+
+####@pulkitag: This is good. 
+class RewardReacher(BaseRewarder):
+  
+  @property
+  def contains_block(self):
+    if hasattr(self.prms['sim'], 'reacher_arm') and hasattr(self.prms['sim'], 'move_block') and hasattr(self.prms['sim'], 'other_block'):
+      move_block = self.prms['sim'].move_block0,  0,  0,
+      other_block = self.prms['sim'].other_block
+      reacher_arm = self.prms['sim'].reacher_arm
+      return 1 if move_block.pos[2] == other_block.size and not reacher_arm.hold_block else 0
+    return 0
+
+  @overrides
+  def get(self):
+    return self.contains_block
+
+# need additional actions: to pick up and to drop the block
+class ContinuousReacherAction(BaseContinuousAction):
+  @overrides
+  def action_dim(self):
+    return 5
+
+  @overrides
+  def process(self, action):
+    return np.around(action, decimals=0)
+
+  def minval(self):
+    return 0
+
+  def maxval(self):
+    return 32
+
+class InitReacher(BaseInitializer):
+  @overrides
+  def sample_env_init(self):
+    sim = self.simulator['sim']
+    sim.reacher_arm.pos = np.random.randint(0, 32 - sim.reacher_arm.size, size=3)
+    sim.reacher_arm.hold_block = False
+    sim.move_block.pos = np.random.randint(0, 32 - sim.move_block.size, size=3)
+    sim.move_block.pos[2] = 0
+    sim.other_block.pos = np.random.randint(0, 32 - sim.move_block.size, size=3)
+    sim.other_block.pos[2] = 0
+    if contains(sim.other_block, sim.move_block):
+      sim.move_block.pos[2] = sim.other_block.size
+
+def get_environment(obsType='StateIm', max_episode_length=100, initPrms={}, obsPrms={}, rewPrms={}, actPrms={}):
+  sim = SimpleReacherSimulator()
+  initPrms = {'sim' : sim}
+  initObj = InitReacher(initPrms)
+  obsObj = globals()[obsType](sim, obsPrms)
+  rewPrms = { 'sim': sim }
+  rewObj = RewardReacher(sim, rewPrms)
+  actObj = ContinuousReacherAction(actPrms)
+  env = BaseEnvironment(sim, initObj, obsObj, rewObj, actObj,
+    params={'max_episode_length':max_episode_length})
+  return env
diff --git a/envs/stacker_agent.py b/envs/stacker_agent.py
index 9b5a1d5..d97c84f 100644
--- a/envs/stacker_agent.py
+++ b/envs/stacker_agent.py
@@ -3,21 +3,22 @@
 from overrides import overrides
 from pyhelper_fns import vis_utils
 
-class StackedBox():
-
-  def __init__(self, pos, width=3, height=3, length=3):
-    assert len(pos) == 3, 'must enter an xyz position'
+def contains(obj1, obj2):
+  x1, y1, _ = obj1.pos.tolist()
+  x2, y2, _ = obj2.pos.tolist()
+  s1, s2 = obj1.size, obj2.size
+  if x2 + s2 <= x1 or x1 + s1 <= x2:
+    return False
+  if y2 + s2 <= y1 or y1 + s1 <= y2:
+    return False
+  return True
 
-    self.pos = pos
-    self.width = width 
-    self.height = height
-    self.length = length
+class StackedBox():
 
-  def contains(self, new_pos):
-    x, y, z = self.pos
-    new_x, new_y, new_z = new_pos
-    return new_x >= x and new_x <= x + self.width \
-      and new_y >= y and new_y <= y + self.height and new_z >= z and new_z <= z + self.length
+  def __init__(self, pos, size=2):
+    # position is center of the box
+    self.pos = pos.reshape((3,))
+    self.size = size
 
 class SimpleStackerSimulator(BaseSimulator):
   def __init__(self, **kwargs):
@@ -25,28 +26,30 @@ def __init__(self, **kwargs):
     self._imSz = 32
     self.width = self._imSz
     self.height = self._imSz
-    self.block1 = StackedBox(np.zeros(3))
-    self.block2 = StackedBox(np.zeros(3))
+    self.move_block = StackedBox(np.zeros(3))
+    self.other_block = StackedBox(np.zeros(3))
 
     self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
-    self._range_min = 0
-    self._range_max = 32 - self.block1.width
+    self._range_min = self.move_block.size
+    self._range_max = self._imSz - self.move_block.size
 
   @overrides
   def step(self, pos):
     #####@pulkitag: You should make use of BaseDiscreteAction and BaseContinuousAction
     #### to input the actions. 
-    assert len(pos) == 2, 'step takes in an xy position'
-
-    x, y = min(pos[0], 32 - 3), min(pos[1], 32 - 3)
-    self.block2.pos = np.array([x, y, 0])
-    if self.block1.contains(self.block2.pos):
-      self.block2.pos[2] = self.block1.height
+    pos = pos.reshape((2,))
+    pos = np.append(pos, np.zeros(1))
+    
+    new_pos = np.clip(self.move_block.pos + pos, self._range_min, self._range_max)
+    new_pos[2] = 0
+    self.move_block.pos = new_pos
+    if contains(self.move_block, self.other_block):
+      self.move_block.pos[2] = self.other_block.size
 
   def _plot_object(self, coords, color='r'):
     x, y = coords
-    mnx, mxx  = int(x), int(min(self._imSz, x + self.block1.width))
-    mny, mxy  = int(y), int(min(self._imSz, y + self.block1.height))
+    mnx, mxx  = int(x), int(min(self._imSz, x + self.move_block.size))
+    mny, mxy  = int(y), int(min(self._imSz, y + self.move_block.size))
     if color == 'r':
       self._im[mny:mxy, mnx:mxx, 0] = 255
     elif color == 'g':
@@ -58,8 +61,8 @@ def _plot_object(self, coords, color='r'):
   def get_image(self):
     imSz = self._imSz
     rng = np.linspace(self._range_min, self._range_max, imSz)
-    x_1, y_1 = self.block1.pos[0], self.block1.pos[1]
-    x_2, y_2 = self.block2.pos[0], self.block2.pos[1]
+    x_1, y_1 = self.other_block.pos[0], self.other_block.pos[1]
+    x_2, y_2 = self.move_block.pos[0], self.move_block.pos[1]
     self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)
     self._plot_object((x_1, y_1), 'r')
     self._plot_object((x_2, y_2), 'g')
@@ -87,12 +90,38 @@ def observation(self):
     obs['im'] = self.simulator.get_image()
     return obs
 
+class StateIm(BaseObservation):
+
+  @overrides
+  def ndim(self):
+    dim = {}
+    dim['im'] = (6, 1)
+    return dim
+
+  def scale(self, obs):
+    norm = np.linalg.norm(obs)
+    obs = np.copy(obs)
+    obs = obs / 16 - 1
+    return obs
+
+  @overrides
+  def observation(self):
+    obs = {}
+    m_pos = self.simulator.move_block.pos
+    o_pos = self.simulator.other_block.pos
+    new_im = np.append(self.scale(m_pos), self.scale(o_pos))
+    obs['im'] = new_im
+    return obs
+
+
 ####@pulkitag: This is good. 
 class RewardStacker(BaseRewarder):
   
   @property
   def block_height(self):
-    return self.prms['sim'].block2.pos[2] if hasattr(self.prms['sim'], 'block2') else 0
+    if hasattr(self.prms['sim'], 'other_block') and hasattr(self.prms['sim'], 'move_block'):
+      return 1 if self.prms['sim'].move_block.pos[2] == self.prms['sim'].other_block.size else 0
+    return 0
 
   @overrides
   def get(self):
@@ -105,29 +134,32 @@ def action_dim(self):
 
   @overrides
   def process(self, action):
-    return np.around(action, decimals=0)
+    return action
 
   def minval(self):
-    return 0
+    return -1
 
   def maxval(self):
-    return 32
+    return 1
 
 class InitStacker(BaseInitializer):
   @overrides
   def sample_env_init(self):
-    self.simulator.block1.pos = np.random.randint(0, 32 - 3, size=3)
-    self.simulator.block1.pos[2] = 0
-    self.simulator.block2.pos = np.random.randint(0, 32 - 3, size=3)
-    self.simulator.block2.pos[2] = 0
-    if self.simulator.block1.contains(self.simulator.block2.pos):
-      self.simulator.block2.pos[2] = self.simulator.block1.height
-
-def get_environment(max_episode_length=100, initPrms={}, obsPrms={}, rewPrms={}, actPrms={}):
+    sim = self.simulator['sim']
+    size = sim.move_block.size
+    sim.move_block.pos = np.random.randint(sim._range_min, sim._range_max, size=3)
+    sim.move_block.pos[2] = 0
+    sim.other_block.pos = np.random.randint(sim._range_min, sim._range_max, size=3)
+    sim.other_block.pos[2] = 0
+    if contains(sim.move_block, sim.other_block):
+      sim.move_block.pos[2] = size
+
+def get_environment(obsType='StateIm', max_episode_length=100, initPrms={}, obsPrms={}, rewPrms={}, actPrms={}):
   sim = SimpleStackerSimulator()
-  initObj = InitStacker(initPrms)
-  obsObj = StackerIm(sim, obsPrms)
   rewPrms = { 'sim': sim }
+  initPrms = { 'sim': sim }
+  initObj = InitStacker(initPrms)
+  obsObj = globals()[obsType](sim, obsPrms)
   rewObj = RewardStacker(sim, rewPrms)
   actObj = ContinuousStackerAction(actPrms)
   env = BaseEnvironment(sim, initObj, obsObj, rewObj, actObj,

From 4ce904bdbfde0ad4e3af5aa479b85829b97a0b46 Mon Sep 17 00:00:00 2001
From: JasmineDeng <jasmine.deng@gmail.com>
Date: Mon, 17 Apr 2017 20:23:13 -0700
Subject: [PATCH 08/10] two blocks in reacher_agent now

---
 core/gym_wrapper.py   |   1 +
 envs/reacher_agent.py | 174 ++++++++++++++++++++++++------------------
 envs/stacker_agent.py |  11 +--
 3 files changed, 106 insertions(+), 80 deletions(-)

diff --git a/core/gym_wrapper.py b/core/gym_wrapper.py
index ebde5a8..3a08bf5 100644
--- a/core/gym_wrapper.py
+++ b/core/gym_wrapper.py
@@ -62,6 +62,7 @@ def _step(self, action):
     self.env.step(action)
     obs    = self._observation()
     reward = self.env.reward()
+    # done   = self.env.simulator.done # test?
     done   = False
     return obs, reward, done, dict(reward=reward)
 
diff --git a/envs/reacher_agent.py b/envs/reacher_agent.py
index 4789b4a..04d9f69 100644
--- a/envs/reacher_agent.py
+++ b/envs/reacher_agent.py
@@ -4,81 +4,85 @@
 from pyhelper_fns import vis_utils
 
 def contains(obj1, obj2):
-  x1, y1, _ = obj1.pos.tolist()
-  x2, y2, _ = obj2.pos.tolist()
+  if not obj1 or not obj2:
+    return False
+  x1, y1 = obj1.pos[0], obj1.pos[1]
+  x2, y2 = obj2.pos[0], obj2.pos[1]
   s1, s2 = obj1.size, obj2.size
-  if x2 + w2 <= x1 or x1 + w1 <= x2:
+  x1, y1 = x1, y1
+  x2, y2 = x2, y2
+  if x2 + 2 * s2 <= x1 or x1 + 2 * s1 <= x2:
     return False
-  if y2 + l2 <= y1 or y1 + l1 <= y2:
+  if y2 + 2 * s2 <= y1 or y1 + 2 * s1 <= y2:
     return False
   return True
 
 class StackedBox():
-
-  def __init__(self, pos, size=3):
-    assert len(pos) == 3, 'must enter an xyz position'
-
+  def __init__(self, pos, size=2):
     self.pos = pos.reshape((3,))
-    # height is the height in 3d, box in 2d is width x length
     self.size = size
 
 class ReacherArm():
-
-  def __init__(self, pos, size=4):
-    # takes in xyz position, but z is implicitly infinity
+  def __init__(self, pos, size=2):
     self.pos = pos.reshape((2,))
     self.size = size
 
-    self.hold_block = False
+    self.block = None
 
 class SimpleReacherSimulator(BaseSimulator):
   def __init__(self, **kwargs):
     # should always have same size blocks (width = height = length = n)
-    super(SimpleStackerSimulator, self).__init__(**kwargs)
+    super(SimpleReacherSimulator, self).__init__(**kwargs)
     self._imSz = 32
     self.width = self._imSz
     self.height = self._imSz
-    self.move_block = StackedBox(np.zeros(3))
-    self.other_block = StackedBox(np.zeros(3))
-
-    self.reacher_arm = ReacherArm(np.zeros(3))
+    self.move_block = [StackedBox(np.zeros(3)), StackedBox(np.zeros(3))]
+    self.goal = StackedBox(np.zeros(3))
+    self.reacher_arm = ReacherArm(np.zeros(2))
 
     self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
-    self._range_min = 0
-    self._range_max = 32 - self.move_block.size
+    self._range_min = self.reacher_arm.size
+    self._range_max = 32 - self.reacher_arm.size
+
+  @property
+  def done(self):
+    for block in self.move_block:
+      if block.pos[2] != self.goal.size:
+        return False
+    return True
 
   @overrides
   def step(self, action):
-    # should take in xyz position, but z position doesn't matter
-    action = action.reshape((5,))
-    pos = action[:3]
-    grab_act = action[3] < 4
-    drop_act = action[4] < 4
-    print(grab_act, drop_act)
-    
-    if grab_act: 
-      if contains(self.reacher_arm, self.move_block):
-        self.reacher_arm.hold_block = True
-
-    elif drop_act:
-      if self.reacher_arm.hold_block:
-        self.reacher_arm.hold_block = False
-        self.move_block.pos[2] = 0
-        if contains(self.other_block, self.move_block):
-          self.move_block.pos[2] = self.other_block.size
-
-    if self.reacher_arm.hold_block:
-      delta = pos[0] - self.reacher_arm.pos[0], pos[1] - self.reacher_arm.pos[1]
-      self.move_block.pos = np.array([self.move_block.pos[0] + delta[0], 
-              self.move_block.pos[1] + delta[1], 10]) # to preserve relative position
-
-    self.reacher_arm.pos = pos
-
+    action = action.reshape((2,))
+    new_pos = self.reacher_arm.pos + action
+
+    on_goal = contains(self.reacher_arm.block, self.goal)
+    on_block = None
+    for block in self.move_block:
+      if contains(self.reacher_arm, block):
+        on_block = block
+        break
+
+    if self.reacher_arm.block is None and on_block is not None and not contains(on_block, self.goal):
+      self.reacher_arm.block = on_block
+
+    if on_goal:
+      self.reacher_arm.block.pos[2] = self.goal.size
+      self.reacher_arm.block = None
+
+    if self.reacher_arm.block is not None:
+      block = self.reacher_arm.block
+      delta = new_pos[0] - self.reacher_arm.pos[0], new_pos[1] - self.reacher_arm.pos[1]
+      block.pos = np.array([block.pos[0] + delta[0], 
+              block.pos[1] + delta[1], 32]) # to preserve relative position
+      block.pos = np.clip(block.pos, self._range_min, self._range_max)
+
+    self.reacher_arm.pos = np.clip(new_pos, self._range_min, self._range_max)
+  
   def _plot_object(self, coords, box, color='r'):
     x, y = coords
-    width, length = box[0], box[1]
-    mnx, mxx  = int(x), int(min(self._imSz, x + width))
-    mny, mxy  = int(y), int(min(self._imSz, y + length))
+    mnx, mxx  = int(max(x - box[0], 0)), int(min(self._imSz, x + box[0]))
+    mny, mxy  = int(max(y - box[1], 0)), int(min(self._imSz, y + box[1]))
     if color == 'r':
       self._im[mny:mxy, mnx:mxx, 0] = 255
     elif color == 'g':
@@ -91,12 +95,13 @@ def get_image(self):
     # other blocks in red, active block in green
     imSz = self._imSz
     self._im = np.zeros((imSz, imSz, 3), dtype=np.uint8)
-    x, y = self.other_block.pos[0], self.other_block.pos[1]
-    self._plot_object((x, y), (3, 3), 'r')
-    x, y = self.move_block.pos[0], self.move_block.pos[1]
-    self._plot_object((x, y), (3, 3), 'g')
+    x, y = self.goal.pos[0], self.goal.pos[1]
+    self._plot_object((x, y), (2, 2), 'r')
+    for block in self.move_block:
+      x, y = block.pos[0], block.pos[1]
+      self._plot_object((x, y), (2, 2), 'g')
     x, y = self.reacher_arm.pos[0], self.reacher_arm.pos[1]
-    self._plot_object((x, y), (4, 4), 'b')
+    self._plot_object((x, y), (2, 2), 'b')
     return self._im.copy()
 
   @overrides 
@@ -105,6 +110,8 @@ def _setup_renderer(self):
 
   @overrides
   def render(self):
+    if not hasattr(self, '_canvas'):
+      self._setup_renderer()
     self._canvas._display(self.get_image())
 
 class StackerIm(BaseObservation):
@@ -126,28 +133,42 @@ class StateIm(BaseObservation):
   @overrides
   def ndim(self):
     dim = {}
-    dim['im'] = (9, 1)
+    num_blocks = len(self.simulator.move_block)
+    dim['im'] = (num_blocks * 3 + 6, 1)
     return dim
 
+  def scale(self, obs):
+    obs = np.copy(obs)
+    obs = obs / 16 - 1
+    return obs
+
   @overrides
   def observation(self):
     obs = {}
-    new_im = np.append(self.simulator.move_block.pos, self.simulator.other_block.pos)
-    new_im = np.append(new_im, self.simulator.reacher_arm.pos[:2])
-    new_im = np.append(new_im, np.array([1 if self.simulator.reacher_arm.hold_block else 0]))
+
+    m_block = self.simulator.move_block
+    g_pos = self.simulator.goal.pos
+    r_pos = self.simulator.reacher_arm.pos
+    new_im = np.append(self.scale(g_pos), self.scale(r_pos))
+    new_im = np.append(new_im, np.array([1 if self.simulator.reacher_arm.block is not None else 0]))
+    for b in m_block:
+      new_im = np.append(new_im, self.scale(b.pos))
     obs['im'] = new_im
     return obs
 
 ####@pulkitag: This is good. 
 class RewardReacher(BaseRewarder):
-  
+
   @property
   def contains_block(self):
-    if hasattr(self.prms['sim'], 'reacher_arm') and hasattr(self.prms['sim'], 'move_block') and hasattr(self.prms['sim'], 'other_block'):
-      move_block = self.prms['sim'].move_block0,  0,  0,
-      other_block = self.prms['sim'].other_block
-      reacher_arm = self.prms['sim'].reacher_arm
-      return 1 if move_block.pos[2] == other_block.size and not reacher_arm.hold_block else 0
+    if hasattr(self.prms['sim'], 'move_block') and hasattr(self.prms['sim'], 'goal'):
+      move_block = self.prms['sim'].move_block
+      goal = self.prms['sim'].goal
+      reward = 0
+      for block in move_block:
+        if block.pos[2] == goal.size:
+          reward += 1  
+      return reward / float(len(move_block))
     return 0
 
   @overrides
@@ -158,30 +179,33 @@ def get(self):
 class ContinuousReacherAction(BaseContinuousAction):
   @overrides
   def action_dim(self):
-    return 5
+    return 2
 
   @overrides
   def process(self, action):
-    return np.around(action, decimals=0)
+    return action
 
   def minval(self):
-    return 0
+    return -1
 
   def maxval(self):
-    return 32
+    return 1
 
 class InitReacher(BaseInitializer):
   @overrides
   def sample_env_init(self):
     sim = self.simulator['sim']
-    sim.reacher_arm.pos = np.random.randint(0, 32 - sim.reacher_arm.size, size=3)
-    sim.reacher_arm.hold_block = False
-    sim.move_block.pos = np.random.randint(0, 32 - sim.move_block.size, size=3)
-    sim.move_block.pos[2] = 0
-    sim.other_block.pos = np.random.randint(0, 32 - sim.move_block.size, size=3)
-    sim.other_block.pos[2] = 0
-    if contains(sim.other_block, sim.move_block):
-      sim.move_block.pos[2] = sim.other_block.size
+    sim.reacher_arm.pos = np.random.randint(sim._range_min, sim._range_max, size=2)
+    sim.reacher_arm.block = None
+    
+    sim.goal.pos = np.random.randint(sim._range_min, sim._range_max, size=3)
+    sim.goal.pos[2] = 0
+    for block in sim.move_block:
+      block.pos = np.random.randint(sim._range_min, sim._range_max, size=3)
+      block.pos[2] = 0
+      if contains(sim.goal, block):
+        block.pos[2] = sim.goal.size
+
 
 def get_environment(obsType='StateIm', max_episode_length=100, initPrms={}, obsPrms={}, rewPrms={}, actPrms={}):
   sim = SimpleReacherSimulator()
diff --git a/envs/stacker_agent.py b/envs/stacker_agent.py
index d97c84f..d8b81ce 100644
--- a/envs/stacker_agent.py
+++ b/envs/stacker_agent.py
@@ -4,9 +4,11 @@
 from pyhelper_fns import vis_utils
 
 def contains(obj1, obj2):
-  x1, y1, _ = obj1.pos.tolist()
-  x2, y2, _ = obj2.pos.tolist()
+  x1, y1 = obj1.pos[0], obj1.pos[1]
+  x2, y2 = obj2.pos[0], obj2.pos[1]
   s1, s2 = obj1.size, obj2.size
+  x1, y1 = x1 - s1, y1 - s1
+  x2, y2 = x2 - s2, y2 - s2
   if x2 + s2 <= x1 or x1 + s1 <= x2:
     return False
   if y2 + s2 <= y1 or y1 + s1 <= y2:
@@ -14,7 +16,6 @@ def contains(obj1, obj2):
   return True
 
 class StackedBox():
-
   def __init__(self, pos, size=2):
     # position is center of the box
     self.pos = pos.reshape((3,))
@@ -48,8 +49,8 @@ def step(self, pos):
 
   def _plot_object(self, coords, color='r'):
     x, y = coords
-    mnx, mxx  = int(x), int(min(self._imSz, x + self.move_block.size))
-    mny, mxy  = int(y), int(min(self._imSz, y + self.move_block.size))
+    mnx, mxx  = max(x - self.move_block.size, 0), min(self._imSz, x + self.move_block.size)
+    mny, mxy  = max(y - self.move_block.size, 0), min(self._imSz, y + self.move_block.size)
     if color == 'r':
       self._im[mny:mxy, mnx:mxx, 0] = 255
     elif color == 'g':

From 531715eaa857c9e6fa2bec8bde979d778fd51e57 Mon Sep 17 00:00:00 2001
From: JasmineDeng <jasmine.deng@gmail.com>
Date: Fri, 21 Apr 2017 17:32:20 -0700
Subject: [PATCH 09/10] working training for 1 and 2 blocks

---
 core/gym_wrapper.py   |  3 +--
 envs/reacher_agent.py | 30 ++++++++++++------------------
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/core/gym_wrapper.py b/core/gym_wrapper.py
index 3a08bf5..acb490d 100644
--- a/core/gym_wrapper.py
+++ b/core/gym_wrapper.py
@@ -62,8 +62,7 @@ def _step(self, action):
     self.env.step(action)
     obs    = self._observation()
     reward = self.env.reward()
-    # done   = self.env.simulator.done # test?
-    done   = False
+    done   = self.env.simulator.done # test?
     return obs, reward, done, dict(reward=reward)
 
 
diff --git a/envs/reacher_agent.py b/envs/reacher_agent.py
index 04d9f69..1e958b7 100644
--- a/envs/reacher_agent.py
+++ b/envs/reacher_agent.py
@@ -30,31 +30,30 @@ def __init__(self, pos, size=2):
     self.block = None
 
 class SimpleReacherSimulator(BaseSimulator):
-  def __init__(self, **kwargs):
+  def __init__(self, num_blocks=2, **kwargs):
     # should always have same size blocks (width = height = length = n)
     super(SimpleReacherSimulator, self).__init__(**kwargs)
     self._imSz = 32
     self.width = self._imSz
     self.height = self._imSz
-    self.move_block = [StackedBox(np.zeros(3)), StackedBox(np.zeros(3))]
+    self.move_block = [StackedBox(np.zeros(3)) for i in range(num_blocks)]
     self.goal = StackedBox(np.zeros(3))
     self.reacher_arm = ReacherArm(np.zeros(2))
 
     self._im = np.zeros((self._imSz, self._imSz, 3), dtype=np.uint8)
     self._range_min = self.reacher_arm.size
-    self._range_max = 32 - self.reacher_arm.size
+    self._range_max = self._imSz - self.reacher_arm.size
 
   @property
   def done(self):
     for block in self.move_block:
-      if block.pos[2] != self.goal.size:
+      if not contains(block, self.goal):
         return False
     return True
 
   @overrides
   def step(self, action):
     action = action.reshape((2,))
-    new_pos = self.reacher_arm.pos + action
 
     on_goal = contains(self.reacher_arm.block, self.goal)
     on_block = None
@@ -70,14 +69,13 @@ def step(self, action):
       self.reacher_arm.block.pos[2] = self.goal.size
       self.reacher_arm.block = None
 
+    self.reacher_arm.pos = np.clip(self.reacher_arm.pos + action, self._range_min, self._range_max)
+
     if self.reacher_arm.block is not None:
       block = self.reacher_arm.block
-      delta = new_pos[0] - self.reacher_arm.pos[0], new_pos[1] - self.reacher_arm.pos[1]
-      block.pos = np.array([block.pos[0] + delta[0], 
-              block.pos[1] + delta[1], 32]) # to preserve relative position
-      block.pos = np.clip(block.pos, self._range_min, self._range_max)
-
-    self.reacher_arm.pos = np.clip(new_pos, self._range_min, self._range_max)
+      action = np.append(action, 0)
+      block.pos = np.clip(block.pos + action, self._range_min, self._range_max)
+      block.pos[2] = self._imSz
   
   def _plot_object(self, coords, box, color='r'):
     x, y = coords
@@ -139,7 +137,7 @@ def ndim(self):
 
   def scale(self, obs):
     obs = np.copy(obs)
-    obs = obs / 16 - 1
+    obs = obs / float(self.simulator._imSz) * 2 - 1
     return obs
 
   @overrides
@@ -164,11 +162,8 @@ def contains_block(self):
     if hasattr(self.prms['sim'], 'move_block') and hasattr(self.prms['sim'], 'goal'):
       move_block = self.prms['sim'].move_block
       goal = self.prms['sim'].goal
-      reward = 0
-      for block in move_block:
-        if block.pos[2] == goal.size:
-          reward += 1  
-      return reward / float(len(move_block))
+      reward = 1 if all([contains(block, goal) for block in move_block]) else 0
+      return reward
     return 0
 
   @overrides
@@ -206,7 +201,6 @@ def sample_env_init(self):
       if contains(sim.goal, block):
         block.pos[2] = sim.goal.size
 
-
 def get_environment(obsType='StateIm', max_episode_length=100, initPrms={}, obsPrms={}, rewPrms={}, actPrms={}):
   sim = SimpleReacherSimulator()
   initPrms = {'sim' : sim}

From 5a167850b0e69021b26cc5cf5d9da49c0df1e736 Mon Sep 17 00:00:00 2001
From: JasmineDeng <jasmine.deng@gmail.com>
Date: Fri, 21 Apr 2017 17:33:20 -0700
Subject: [PATCH 10/10] modified to use if height instead of contains for
 checking when terminal

---
 envs/reacher_agent.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/envs/reacher_agent.py b/envs/reacher_agent.py
index 1e958b7..26da313 100644
--- a/envs/reacher_agent.py
+++ b/envs/reacher_agent.py
@@ -47,7 +47,7 @@ def __init__(self, num_blocks=2, **kwargs):
   @property
   def done(self):
     for block in self.move_block:
-      if not contains(block, self.goal):
+      if block.pos[2] != self.goal.size:
         return False
     return True
 
@@ -162,7 +162,7 @@ def contains_block(self):
     if hasattr(self.prms['sim'], 'move_block') and hasattr(self.prms['sim'], 'goal'):
       move_block = self.prms['sim'].move_block
       goal = self.prms['sim'].goal
-      reward = 1 if all([contains(block, goal) for block in move_block]) else 0
+      reward = 1 if all([block.pos[2] == goal.size for block in move_block]) else 0
       return reward
     return 0