From 2a3b8d7262e2f79269c8216a1d01f84207876fe3 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 13 Feb 2020 22:46:30 -0800
Subject: [PATCH 01/53] allow order objects by position in view, reposition
 camera, narrower camera view angle for pioneer2dx_noplugin

---
 .../models/pioneer2dx_noplugin/model.sdf      |   6 +-
 python/social_bot/tasks.py                    | 100 +++++++++++-------
 2 files changed, 66 insertions(+), 40 deletions(-)
diff --git a/python/social_bot/models/pioneer2dx_noplugin/model.sdf b/python/social_bot/models/pioneer2dx_noplugin/model.sdf
index f58365c1..045ba5c6 100644
--- a/python/social_bot/models/pioneer2dx_noplugin/model.sdf
+++ b/python/social_bot/models/pioneer2dx_noplugin/model.sdf
@@ -63,7 +63,7 @@
       </visual>
     </link>
     <link name="camera_link">
-      <pose>0.25 0 0.25 0 0 0</pose>
+      <pose>0.25 0 0.20 0 0 0</pose>
       <visual name="visual">
         <geometry>
           <box>
@@ -73,13 +73,13 @@
       </visual>
       <sensor name="camera" type="camera">
         <camera>
-          <horizontal_fov>2.0944</horizontal_fov>
+          <horizontal_fov>1.0472</horizontal_fov>
           <image>
             <width>128</width>
             <height>128</height>
           </image>
           <clip>
-            <near>0.1</near>
+            <near>0.001</near>
             <far>100</far>
           </clip>
         </camera>
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 60f7ee22..8ff4838e 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -17,6 +17,7 @@
 
 import math
 import numpy as np
+import operator
 import os
 import gin
 import itertools
@@ -169,44 +170,48 @@ class GoalTask(Task):
     it will get reward -1.
     """
 
-    def __init__(self,
-                 env,
-                 max_steps,
-                 goal_name="ball",
-                 distraction_list=[
-                     'coke_can', 'table', 'car_wheel', 'plastic_cup', 'beer'
-                 ],
-                 success_distance_thresh=0.5,
-                 fail_distance_thresh=2.0,
-                 distraction_penalty_distance_thresh=0,
-                 distraction_penalty=0.5,
-                 random_agent_orientation=False,
-                 sparse_reward=True,
-                 random_range=5.0,
-                 polar_coord=True,
-                 random_goal=False,
-                 use_curriculum_training=False,
-                 curriculum_distractions=True,
-                 curriculum_target_angle=False,
-                 switch_goal_within_episode=False,
-                 start_range=0,
-                 increase_range_by_percent=50.,
-                 reward_thresh_to_increase_range=0.4,
-                 percent_full_range_in_curriculum=0.1,
-                 max_reward_q_length=100,
-                 reward_weight=1.0,
-                 move_goal_during_episode=True,
-                 end_episode_after_success=False,
-                 success_with_angle_requirement=True,
-                 additional_observation_list=[],
-                 use_egocentric_states=False,
-                 egocentric_perception_range=0):
+    def __init__(
+            self,
+            env,
+            max_steps,
+            goal_name="ball",
+            distraction_list=[
+                'coke_can', 'table', 'car_wheel', 'plastic_cup', 'beer'
+            ],
+            end_on_hitting_distraction=False,
+            end_episode_after_success=False,
+            success_distance_thresh=0.5,
+            fail_distance_thresh=2.0,
+            distraction_penalty_distance_thresh=0,
+            distraction_penalty=0.5,
+            random_agent_orientation=False,
+            sparse_reward=True,
+            random_range=5.0,
+            polar_coord=True,
+            random_goal=False,
+            use_curriculum_training=False,
+            curriculum_distractions=True,
+            curriculum_target_angle=False,
+            switch_goal_within_episode=False,
+            start_range=0,
+            increase_range_by_percent=50.,
+            reward_thresh_to_increase_range=0.4,
+            percent_full_range_in_curriculum=0.1,
+            max_reward_q_length=100,
+            reward_weight=1.0,
+            move_goal_during_episode=True,
+            success_with_angle_requirement=True,
+            additional_observation_list=[],
+            use_egocentric_states=False,
+            egocentric_perception_range=0,
+            order_obj_by_view=False):
         """
         Args:
             env (gym.Env): an instance of Environment
             max_steps (int): episode will end if not reaching gaol in so many steps
             goal_name (string): name of the goal in the world
             distraction_list (list of string): a list of model. the model shoud be in gazebo database
+            end_on_hitting_distraction (bool): whether to end episode on hitting distraction
             success_distance_thresh (float): the goal is reached if it's within this distance to the agent
             fail_distance_thresh (float): if the agent moves away from the goal more than this distance,
                 it's considered a failure and is given reward -1
@@ -247,6 +252,8 @@ def __init__(self,
             egocentric_perception_range (float): the max range in degree to limit the agent's observation.
                 E.g. 60 means object is only visible when it's within +/-60 degrees in front of the agent's
                 direction (yaw).
+            order_obj_by_view (bool): False to order object coordinates (full states) in object id order;
+                True to order according to y axis value (left to right) together with object id.
         """
         self._max_play_ground_size = 5  # play ground will be (-5, 5) for both x and y axes.
         # TODO: Remove the default grey walls in the play ground world file,
@@ -255,6 +262,7 @@ def __init__(self,
         super().__init__(
             env=env, max_steps=max_steps, reward_weight=reward_weight)
         self._goal_name = goal_name
+        self.end_on_hitting_distraction = end_on_hitting_distraction
         self._success_distance_thresh = success_distance_thresh
         self._fail_distance_thresh = fail_distance_thresh
         self._distraction_penalty_distance_thresh = distraction_penalty_distance_thresh
@@ -294,6 +302,7 @@ def __init__(self,
         self._polar_coord = polar_coord
         self._use_egocentric_states = use_egocentric_states
         self._egocentric_perception_range = egocentric_perception_range
+        self._order_obj_by_view = order_obj_by_view
         if self.should_use_curriculum_training():
             self._orig_random_range = random_range
             self._random_range = start_range
@@ -306,8 +315,8 @@ def __init__(self,
             if curriculum_target_angle:
                 angle_str = ", start_angle {}".format(self._random_angle)
             logging.info(
-                "start_range %f%s, reward_thresh_to_increase_range %f",
-                self._start_range, angle_str,
+                "Env %d: start_range %f%s, reward_thresh_to_increase_range %f",
+                self._env._port, self._start_range, angle_str,
                 self._reward_thresh_to_increase_range)
         else:
             self._random_range = random_range
@@ -328,13 +337,15 @@ def _push_reward_queue(self, value):
                 self._reward_thresh_to_increase_range):
             if self._curriculum_target_angle:
                 self._random_angle += 20
-                logging.info("Raising random_angle to %d", self._random_angle)
+                logging.info("Env %d: Raising random_angle to %d",
+                             self._env._port, self._random_angle)
             if (not self._curriculum_target_angle or self._random_angle > 360):
                 self._random_angle = 60
                 new_range = min((1. + self._increase_range_by_percent) *
                                 self._random_range, self._orig_random_range)
                 if self._random_range < self._orig_random_range:
-                    logging.info("Raising random_range to %f", new_range)
+                    logging.info("Env %d: Raising random_range to %f",
+                                 self._env._port, new_range)
                 self._random_range = new_range
             self._q.clear()
 
@@ -417,12 +428,14 @@ def run(self):
                 else:
                     reward = (self._prev_dist - dist) / self._initial_dist
                 reward = reward - distraction_penalty
+                done = False
                 if distraction_penalty > 0:
                     logging.debug("yielding reward: " + str(reward))
                     self._push_reward_queue(0)
+                    done = self.end_on_hitting_distraction
                 self._prev_dist = dist
                 agent_sentence = yield TeacherAction(
-                    reward=reward, sentence=self._goal_name)
+                    reward=reward, sentence=self._goal_name, done=done)
         reward = -1.0
         logging.debug("yielding reward: {}, took more than {} steps".format(
             str(reward), str(self._max_steps)))
@@ -597,6 +610,7 @@ def task_specific_observation(self, agent):
             else:
                 pose = np.concatenate((pose, obj_pos), axis=0)
 
+        obj_array = []
         agent_pose = np.array(agent.get_pose()).flatten()
         if self._use_egocentric_states:
             yaw = agent_pose[5]
@@ -604,6 +618,12 @@ def task_specific_observation(self, agent):
             vx, vy, vz, a1, a2, a3 = np.array(agent.get_velocities()).flatten()
             rvx, rvy = agent.get_egocentric_cord_2d(vx, vy, yaw)
             obs = [rvx, rvy, vz, a1, a2, a3]
+            if self._egocentric_perception_range > 0:
+                obj_array.extend([[0, rvx, rvy, vz], [1, a1, a2, a3]])
+                i = 2
+            else:
+                obj_array.extend([[0, rvx, rvy], [1, vz, a1], [2, a2, a3]])
+                i = 3
             # adds objects' (goal's as well as distractions') egocentric
             # coordinates to observation
             while len(pose) > 1:
@@ -621,8 +641,10 @@ def task_specific_observation(self, agent):
                         rotated_x = 0.
                         rotated_y = 0.
                         magnitude = 0.
+                    obj_array.append([i, rotated_x, rotated_y, magnitude])
                     obs.extend([rotated_x, rotated_y, magnitude])
                 else:
+                    obj_array.append([i, rotated_x, rotated_y])
                     obs.extend([rotated_x, rotated_y])
                 pose = pose[3:]
             obs = np.array(obs)
@@ -631,6 +653,10 @@ def task_specific_observation(self, agent):
             joints_states = agent.get_internal_states()
             obs = np.concatenate((pose, agent_pose, agent_vel, joints_states),
                                  axis=0)
+
+        if self._order_obj_by_view:
+            obj_array.sort(key=operator.itemgetter(2))
+            obs = np.array(obj_array).flatten()
         return obs
 
 

From f37876ad9597c30f6e513406b8c50eec017d332f Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Fri, 14 Feb 2020 15:21:06 -0800
Subject: [PATCH 02/53] field of view back to 120 degrees

---
 python/social_bot/models/pioneer2dx_noplugin/model.sdf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/social_bot/models/pioneer2dx_noplugin/model.sdf b/python/social_bot/models/pioneer2dx_noplugin/model.sdf
index 045ba5c6..9da8cc9a 100644
--- a/python/social_bot/models/pioneer2dx_noplugin/model.sdf
+++ b/python/social_bot/models/pioneer2dx_noplugin/model.sdf
@@ -73,7 +73,7 @@
       </visual>
       <sensor name="camera" type="camera">
         <camera>
-          <horizontal_fov>1.0472</horizontal_fov>
+          <horizontal_fov>2.0944</horizontal_fov>
           <image>
             <width>128</width>
             <height>128</height>

From 781e6c2633ac690f4323dd843408f0e2bcd4fecf Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 14 May 2020 14:24:16 -0700
Subject: [PATCH 03/53] add agent's absolute position and pose into observation

---
 python/social_bot/tasks.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 8ff4838e..32df1082 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -617,13 +617,19 @@ def task_specific_observation(self, agent):
             # adds egocentric velocity input
             vx, vy, vz, a1, a2, a3 = np.array(agent.get_velocities()).flatten()
             rvx, rvy = agent.get_egocentric_cord_2d(vx, vy, yaw)
-            obs = [rvx, rvy, vz, a1, a2, a3]
+            obs = [rvx, rvy, vz, a1, a2, a3] + list(agent_pose)
             if self._egocentric_perception_range > 0:
-                obj_array.extend([[0, rvx, rvy, vz], [1, a1, a2, a3]])
-                i = 2
+                obj_array.extend(
+                    [[0, rvx, rvy, vz], [1, a1, a2, a3],
+                     [2, agent_pose[0], agent_pose[1], agent_pose[2]],
+                     [3, agent_pose[3], agent_pose[4], agent_pose[5]]])
+                i = 4
             else:
-                obj_array.extend([[0, rvx, rvy], [1, vz, a1], [2, a2, a3]])
-                i = 3
+                obj_array.extend([[0, rvx, rvy], [1, vz, a1], [2, a2, a3],
+                                  [3, agent_pose[0], agent_pose[1]],
+                                  [4, agent_pose[2], agent_pose[3]],
+                                  [5, agent_pose[4], agent_pose[5]]])
+                i = 6
             # adds objects' (goal's as well as distractions') egocentric
             # coordinates to observation
             while len(pose) > 1:

From ab15b68cc624b2b0b79a1b346e2cc40cd854d8cd Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Mon, 27 Jul 2020 23:13:36 -0700
Subject: [PATCH 04/53] non-egocentric full state observation for GoalTask

---
 python/social_bot/tasks.py | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 32df1082..bf99e976 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -247,6 +247,7 @@ def __init__(
                 flag will overwrite the effects of flags ``switch_goal_within_episode`` and ``move_goal_during_episode``.
             success_with_angle_requirement: if True then calculate the reward considering the angular requirement
             additional_observation_list: a list of additonal objects to be added
+            use_full_states (bool): For non-image observation, use full states of the world
             use_egocentric_states (bool): For the non-image observation case, use the states transformed to
                 egocentric coordinate, e.g., agent's egocentric distance and direction to goal
             egocentric_perception_range (float): the max range in degree to limit the agent's observation.
@@ -300,6 +301,7 @@ def __init__(
                       self._max_play_ground_size)))
         self._pos_list.remove((0, 0))
         self._polar_coord = polar_coord
+        self._use_full_states = use_full_states
         self._use_egocentric_states = use_egocentric_states
         self._egocentric_perception_range = egocentric_perception_range
         self._order_obj_by_view = order_obj_by_view
@@ -320,6 +322,19 @@ def __init__(
                 self._reward_thresh_to_increase_range)
         else:
             self._random_range = random_range
+        obs_format = "image"
+        if use_full_states or use_egocentric_states:
+            obs_format = "full_state"
+        obs_relative = "ego"
+        if use_full_states and not use_egocentric_states:
+            obs_relative = "absolute"
+        logging.info("Observations: {}, {}.".format(obs_format, obs_relative))
+        if use_full_states and not use_egocentric_states:
+            if not order_obj_by_view:
+                logging.info(
+                    "Dims: 0-5: agent's velocity and angular " +
+                    "velocity, 6-11: agent's position and pose, 12-13: goal x, y"
+                    + ", all distractions' x, y coordinates.")
         self.task_vocab += self._object_list
         self._env.insert_model_list(self._object_list)
 
@@ -612,11 +627,14 @@ def task_specific_observation(self, agent):
 
         obj_array = []
         agent_pose = np.array(agent.get_pose()).flatten()
-        if self._use_egocentric_states:
+        if self._use_full_states or self._use_egocentric_states:
             yaw = agent_pose[5]
             # adds egocentric velocity input
             vx, vy, vz, a1, a2, a3 = np.array(agent.get_velocities()).flatten()
-            rvx, rvy = agent.get_egocentric_cord_2d(vx, vy, yaw)
+            if self._use_egocentric_states:
+                rvx, rvy = agent.get_egocentric_cord_2d(vx, vy, yaw)
+            else:
+                rvx, rvy = vx, vy
             obs = [rvx, rvy, vz, a1, a2, a3] + list(agent_pose)
             if self._egocentric_perception_range > 0:
                 obj_array.extend(
@@ -633,9 +651,15 @@ def task_specific_observation(self, agent):
             # adds objects' (goal's as well as distractions') egocentric
             # coordinates to observation
             while len(pose) > 1:
-                x = pose[0] - agent_pose[0]
-                y = pose[1] - agent_pose[1]
-                rotated_x, rotated_y = agent.get_egocentric_cord_2d(x, y, yaw)
+                x = pose[0]
+                y = pose[1]
+                if self._use_egocentric_states:
+                    x = pose[0] - agent_pose[0]
+                    y = pose[1] - agent_pose[1]
+                    rotated_x, rotated_y = agent.get_egocentric_cord_2d(
+                        x, y, yaw)
+                else:
+                    rotated_x, rotated_y = x, y
                 if self._egocentric_perception_range > 0:
                     dist = math.sqrt(rotated_x * rotated_x +
                                      rotated_y * rotated_y)
@@ -661,6 +685,7 @@ def task_specific_observation(self, agent):
                                  axis=0)
 
         if self._order_obj_by_view:
+            # TODO: skip first few dims corresponding to rvx, a1 etc..
             obj_array.sort(key=operator.itemgetter(2))
             obs = np.array(obj_array).flatten()
         return obs

From 4b1dfd12ae70f81dc26105a33dff24c472c118c8 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Wed, 29 Jul 2020 21:50:37 -0700
Subject: [PATCH 05/53] goal conditioned GoalTask and PlayGround env, with -1/0
 reward

---
 python/social_bot/envs/play_ground.py | 31 +++++++++++++++++++++++----
 python/social_bot/tasks.py            | 12 +++++++++--
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index 54a7453e..4cee7fdd 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -14,6 +14,7 @@
 """
 A simple enviroment for an agent playing on the ground
 """
+from collections import OrderedDict
 import os
 import time
 import math
@@ -71,6 +72,7 @@ def __init__(self,
                  agent_type='pioneer2dx_noplugin',
                  world_name="play_ground.world",
                  tasks=[GoalTask],
+                 goal_conditioned=False,
                  with_language=False,
                  with_agent_language=False,
                  use_image_observation=False,
@@ -93,6 +95,7 @@ def __init__(self,
             world_name (string): Select the world file, e.g., empty.world, play_ground.world,
                 grocery_ground.world
             tasks (list): a list of teacher.Task, e.g., GoalTask, KickingBallTask
+            goal_conditioned (bool): Turn on goal conditioned tasks.
             with_language (bool): The observation will be a dict with an extra sentence
             with_agent_language (bool): Include agent sentence in action space.
             use_image_observation (bool): Use image, or use low-dimentional states as
@@ -127,6 +130,7 @@ def __init__(self,
         """
 
         self._action_cost = action_cost
+        self._goal_conditioned = goal_conditioned
         self._with_language = with_language
         self._seq_length = vocab_sequence_length
         self._with_agent_language = with_language and with_agent_language
@@ -186,6 +190,27 @@ def __init__(self,
         self.reset()
         self.observation_space = self._agent.get_observation_space(
             self._teacher)
+        if self._goal_conditioned:
+            assert not with_language
+            assert not use_image_observation
+            assert not image_with_internal_states
+            # state observation
+            goal_space = gym.spaces.Box(
+                low=-np.inf, high=np.inf, shape=(2, ), dtype=np.float32)
+            ag_space = goal_space
+            self.observation_space = gym.spaces.Dict(
+                observation=self.observation_space,
+                desired_goal=goal_space,
+                achieved_goal=ag_space)
+
+    def _get_observation(self, teacher_sentence):
+        obs = self._agent.get_observation(self._teacher, teacher_sentence)
+        if self._goal_conditioned:
+            orig = obs
+            obs = OrderedDict(observation=obs)
+            obs['achieved_goal'] = orig[6:8]
+            obs['desired_goal'] = orig[12:14]
+        return obs
 
     def reset(self):
         """
@@ -202,8 +227,7 @@ def reset(self):
         # The first call of "teach() after "done" will reset the task
         teacher_action = self._teacher.teach("")
         self._world.step(self._sub_steps)
-        obs = self._agent.get_observation(self._teacher,
-                                          teacher_action.sentence)
+        obs = self._get_observation(teacher_action.sentence)
         return obs
 
     def step(self, action):
@@ -229,8 +253,7 @@ def step(self, action):
         self._agent.take_action(controls)
         self._world.step(self._sub_steps)
         teacher_action = self._teacher.teach(sentence)
-        obs = self._agent.get_observation(self._teacher,
-                                          teacher_action.sentence)
+        obs = self._get_observation(teacher_action.sentence)
         self._steps_in_this_episode += 1
         ctrl_cost = np.sum(np.square(controls)) / controls.shape[0]
         reward = teacher_action.reward - self._action_cost * ctrl_cost
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index bf99e976..c44f51dd 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -178,6 +178,7 @@ def __init__(
             distraction_list=[
                 'coke_can', 'table', 'car_wheel', 'plastic_cup', 'beer'
             ],
+            goal_conditioned=False,
             end_on_hitting_distraction=False,
             end_episode_after_success=False,
             success_distance_thresh=0.5,
@@ -211,6 +212,8 @@ def __init__(
             max_steps (int): episode will end if not reaching gaol in so many steps
             goal_name (string): name of the goal in the world
             distraction_list (list of string): a list of model. the model shoud be in gazebo database
+            end_episode_after_success (bool): if True, the episode will end once the goal is reached. A True value of this
+                flag will overwrite the effects of flags ``switch_goal_within_episode`` and ``move_goal_during_episode``.
             end_on_hitting_distraction (bool): whether to end episode on hitting distraction
             success_distance_thresh (float): the goal is reached if it's within this distance to the agent
             fail_distance_thresh (float): if the agent moves away from the goal more than this distance,
@@ -243,8 +246,6 @@ def __init__(
             max_reward_q_length (int): how many recent rewards to consider when estimating agent accuracy.
             reward_weight (float): the weight of the reward, is used in multi-task case
             move_goal_during_episode (bool): if True, the goal will be moved during episode, when it has been achieved
-            end_episode_after_success (bool): if True, the episode will end once the goal is reached. A True value of this
-                flag will overwrite the effects of flags ``switch_goal_within_episode`` and ``move_goal_during_episode``.
             success_with_angle_requirement: if True then calculate the reward considering the angular requirement
             additional_observation_list: a list of additonal objects to be added
             use_full_states (bool): For non-image observation, use full states of the world
@@ -263,6 +264,7 @@ def __init__(
         super().__init__(
             env=env, max_steps=max_steps, reward_weight=reward_weight)
         self._goal_name = goal_name
+        self._goal_conditioned = goal_conditioned
         self.end_on_hitting_distraction = end_on_hitting_distraction
         self._success_distance_thresh = success_distance_thresh
         self._fail_distance_thresh = fail_distance_thresh
@@ -411,11 +413,15 @@ def run(self):
                 self._get_distraction_penalty(loc, dot,
                                               prev_min_dist_to_distraction))
 
+            # TODO(Le): compare achieved goal with desired goal if task is
+            # goal conditioned?
             if dist < self._success_distance_thresh and (
                     not self._success_with_angle_requirement or dot > 0.707):
                 # within 45 degrees of the agent direction
                 reward = 1.0 - distraction_penalty
                 self._push_reward_queue(max(reward, 0))
+                if self._goal_conditioned:
+                    reward -= 1.
                 logging.debug("yielding reward: " + str(reward))
                 agent_sentence = yield TeacherAction(
                     reward=reward, sentence="well done",
@@ -440,6 +446,8 @@ def run(self):
             else:
                 if self._sparse_reward:
                     reward = 0
+                    if self._goal_conditioned:
+                        reward = -1
                 else:
                     reward = (self._prev_dist - dist) / self._initial_dist
                 reward = reward - distraction_penalty

From fd43286da83464c0e420191013b5f8f396f25e8f Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Wed, 29 Jul 2020 23:33:51 -0700
Subject: [PATCH 06/53] simplify

---
 python/social_bot/envs/play_ground.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index 4cee7fdd..d1b4a23d 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -197,19 +197,18 @@ def __init__(self,
             # state observation
             goal_space = gym.spaces.Box(
                 low=-np.inf, high=np.inf, shape=(2, ), dtype=np.float32)
-            ag_space = goal_space
             self.observation_space = gym.spaces.Dict(
                 observation=self.observation_space,
                 desired_goal=goal_space,
-                achieved_goal=ag_space)
+                achieved_goal=goal_space)
 
     def _get_observation(self, teacher_sentence):
         obs = self._agent.get_observation(self._teacher, teacher_sentence)
         if self._goal_conditioned:
-            orig = obs
+            flat_obs = obs
             obs = OrderedDict(observation=obs)
-            obs['achieved_goal'] = orig[6:8]
-            obs['desired_goal'] = orig[12:14]
+            obs['achieved_goal'] = flat_obs[6:8]
+            obs['desired_goal'] = flat_obs[12:14]
         return obs
 
     def reset(self):

From e2c6130cf1f0006399a8e0f478be3b2695c36a12 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Wed, 29 Jul 2020 23:55:23 -0700
Subject: [PATCH 07/53] fix after rebase

---
 python/social_bot/tasks.py | 86 ++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 41 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index c44f51dd..d9b6c49f 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -170,42 +170,42 @@ class GoalTask(Task):
     it will get reward -1.
     """
 
-    def __init__(
-            self,
-            env,
-            max_steps,
-            goal_name="ball",
-            distraction_list=[
-                'coke_can', 'table', 'car_wheel', 'plastic_cup', 'beer'
-            ],
-            goal_conditioned=False,
-            end_on_hitting_distraction=False,
-            end_episode_after_success=False,
-            success_distance_thresh=0.5,
-            fail_distance_thresh=2.0,
-            distraction_penalty_distance_thresh=0,
-            distraction_penalty=0.5,
-            random_agent_orientation=False,
-            sparse_reward=True,
-            random_range=5.0,
-            polar_coord=True,
-            random_goal=False,
-            use_curriculum_training=False,
-            curriculum_distractions=True,
-            curriculum_target_angle=False,
-            switch_goal_within_episode=False,
-            start_range=0,
-            increase_range_by_percent=50.,
-            reward_thresh_to_increase_range=0.4,
-            percent_full_range_in_curriculum=0.1,
-            max_reward_q_length=100,
-            reward_weight=1.0,
-            move_goal_during_episode=True,
-            success_with_angle_requirement=True,
-            additional_observation_list=[],
-            use_egocentric_states=False,
-            egocentric_perception_range=0,
-            order_obj_by_view=False):
+    def __init__(self,
+                 env,
+                 max_steps,
+                 goal_name="ball",
+                 distraction_list=[
+                     'coke_can', 'table', 'car_wheel', 'plastic_cup', 'beer'
+                 ],
+                 goal_conditioned=False,
+                 end_on_hitting_distraction=False,
+                 end_episode_after_success=False,
+                 success_distance_thresh=0.5,
+                 fail_distance_thresh=2.0,
+                 distraction_penalty_distance_thresh=0,
+                 distraction_penalty=0.5,
+                 random_agent_orientation=False,
+                 sparse_reward=True,
+                 random_range=5.0,
+                 polar_coord=True,
+                 random_goal=False,
+                 use_curriculum_training=False,
+                 curriculum_distractions=True,
+                 curriculum_target_angle=False,
+                 switch_goal_within_episode=False,
+                 start_range=0,
+                 increase_range_by_percent=50.,
+                 reward_thresh_to_increase_range=0.4,
+                 percent_full_range_in_curriculum=0.1,
+                 max_reward_q_length=100,
+                 reward_weight=1.0,
+                 move_goal_during_episode=True,
+                 success_with_angle_requirement=True,
+                 additional_observation_list=[],
+                 use_full_states=False,
+                 use_egocentric_states=False,
+                 egocentric_perception_range=0,
+                 order_obj_by_view=False):
         """
         Args:
             env (gym.Env): an instance of Environment
@@ -424,7 +424,8 @@ def run(self):
                     reward -= 1.
                 logging.debug("yielding reward: " + str(reward))
                 agent_sentence = yield TeacherAction(
-                    reward=reward, sentence="well done",
+                    reward=reward,
+                    sentence="well done",
                     done=self._end_episode_after_success,
                     success=True)
                 steps_since_last_reward = 0
@@ -975,15 +976,16 @@ def run(self):
                     else:
                         reward = 100.
                     agent_sentence = yield TeacherAction(
-                        reward=reward, sentence="well done", done=True,
+                        reward=reward,
+                        sentence="well done",
+                        done=True,
                         success=True)
                 else:
                     if self._sparse_reward:
                         reward = 0.
                     else:
                         reward = self._target_speed + 3 - dist / init_goal_dist
-                    agent_sentence = yield TeacherAction(
-                        reward=reward)
+                    agent_sentence = yield TeacherAction(reward=reward)
         yield TeacherAction(reward=-1.0, sentence="failed", done=True)
 
     def task_specific_observation(self, agent):
@@ -1204,7 +1206,9 @@ def run(self):
                 logging.debug("object has been successfuly placed")
                 reward = 200.0 if self._reward_shaping else 1.0
                 agent_sentence = yield TeacherAction(
-                    reward=reward, sentence="well done", done=True,
+                    reward=reward,
+                    sentence="well done",
+                    done=True,
                     success=True)
             else:
                 shaped_reward = max(

From 5aa122e5aea2adcae1f39fe957813905e33d33d9 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Fri, 31 Jul 2020 23:37:46 -0700
Subject: [PATCH 08/53] plot goal range in tensorboard

---
 python/social_bot/envs/play_ground.py |  8 +++++++-
 python/social_bot/tasks.py            | 26 ++++++++++++++++++++------
 python/social_bot/teacher.py          | 25 ++++++++++++++++++++-----
 3 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index d1b4a23d..b4a9a90a 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -174,7 +174,10 @@ def __init__(self,
 
         # Setup teacher and tasks
         self._teacher = teacher.Teacher(task_groups_exclusive=False)
+        self._has_goal_task = False
         for task in tasks:
+            if task == GoalTask:
+                self._has_goal_task = True
             task_group = TaskGroup()
             task_group.add_task(task(env=self, max_steps=max_steps))
             self._teacher.add_task_group(task_group)
@@ -260,7 +263,10 @@ def step(self, action):
         if teacher_action.done:
             logging.debug("episode ends at cum reward:" +
                           str(self._cum_reward))
-        return obs, reward, teacher_action.done, {"is_success": teacher_action.success}
+        info = {"is_success": teacher_action.success}
+        if self._has_goal_task:
+            info["goal_range"] = teacher_action.goal_range
+        return obs, reward, teacher_action.done, info
 
     def get_step_time(self):
         """
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index d9b6c49f..4791333a 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -193,7 +193,7 @@ def __init__(self,
                  curriculum_distractions=True,
                  curriculum_target_angle=False,
                  switch_goal_within_episode=False,
-                 start_range=0,
+                 start_range=0.0,
                  increase_range_by_percent=50.,
                  reward_thresh_to_increase_range=0.4,
                  percent_full_range_in_curriculum=0.1,
@@ -423,11 +423,14 @@ def run(self):
                 if self._goal_conditioned:
                     reward -= 1.
                 logging.debug("yielding reward: " + str(reward))
+                done = self._end_episode_after_success
+                goal_range = self._random_range if done else 0
                 agent_sentence = yield TeacherAction(
                     reward=reward,
                     sentence="well done",
-                    done=self._end_episode_after_success,
-                    success=True)
+                    done=done,
+                    success=True,
+                    goal_range=goal_range)
                 steps_since_last_reward = 0
                 if self._switch_goal_within_episode:
                     self.pick_goal()
@@ -443,7 +446,10 @@ def run(self):
                     "yielding reward: {}, farther than {} from goal".format(
                         str(reward), str(self._fail_distance_thresh)))
                 yield TeacherAction(
-                    reward=reward, sentence="failed", done=True)
+                    reward=reward,
+                    sentence="failed",
+                    done=True,
+                    goal_range=self._random_range)
             else:
                 if self._sparse_reward:
                     reward = 0
@@ -458,8 +464,12 @@ def run(self):
                     self._push_reward_queue(0)
                     done = self.end_on_hitting_distraction
                 self._prev_dist = dist
+                goal_range = self._random_range if done else 0
                 agent_sentence = yield TeacherAction(
-                    reward=reward, sentence=self._goal_name, done=done)
+                    reward=reward,
+                    sentence=self._goal_name,
+                    done=done,
+                    goal_range=goal_range)
         reward = -1.0
         logging.debug("yielding reward: {}, took more than {} steps".format(
             str(reward), str(self._max_steps)))
@@ -467,7 +477,11 @@ def run(self):
         if self.should_use_curriculum_training():
             logging.debug("reward queue len: {}, sum: {}".format(
                 str(len(self._q)), str(sum(self._q))))
-        yield TeacherAction(reward=reward, sentence="failed", done=True)
+        yield TeacherAction(
+            reward=reward,
+            sentence="failed",
+            done=True,
+            goal_range=self._random_range)
 
     def _get_distraction_penalty(self, agent_loc, dot,
                                  prev_min_dist_to_distraction):
diff --git a/python/social_bot/teacher.py b/python/social_bot/teacher.py
index e9f4ea40..7c84dd9b 100644
--- a/python/social_bot/teacher.py
+++ b/python/social_bot/teacher.py
@@ -36,8 +36,13 @@ def __init__(self, vocab_size, max_length):
 
 
 class TeacherAction(object):
-    def __init__(self, reward=0.0, sentence="", done=False, is_idle=False,
-                 success=False):
+    def __init__(self,
+                 reward=0.0,
+                 sentence="",
+                 done=False,
+                 is_idle=False,
+                 success=False,
+                 goal_range=0.0):
         """
         Args:
             done: end of an episode if true
@@ -48,6 +53,7 @@ def __init__(self, reward=0.0, sentence="", done=False, is_idle=False,
         self.done = done
         self.is_idle = is_idle
         self.success = success
+        self.goal_range = float(goal_range)
 
 
 class TaskGroup(object):
@@ -86,6 +92,7 @@ def teach(self, agent_sentence):
             TeacherAction
         """
         task = self._get_current_task()
+        teacher_action = None
         try:
             # teacher_action is the value yielded in task
             teacher_action = task.send(agent_sentence)
@@ -98,7 +105,8 @@ def teach(self, agent_sentence):
             task.close()
             self._current_task = None
             self._is_idle = True
-            teacher_action = TeacherAction()
+            teacher_action = TeacherAction(
+                goal_range=teacher_action.goal_range if teacher_action else 0)
 
         return teacher_action
 
@@ -334,6 +342,7 @@ def teach(self, agent_sentence):
             done = False
             active_group_id = -1
             success = False
+            goal_range = 0
             # run all groups in parallel
             for i, g in enumerate(self._task_groups):
                 teacher_action = g.teach(agent_sentence)
@@ -341,6 +350,8 @@ def teach(self, agent_sentence):
                     done = True
                 if teacher_action.success:
                     success = True
+                if teacher_action.goal_range > 0:
+                    goal_range = teacher_action.goal_range
                 weight = g.get_current_reward_weight()
                 final_reward += weight * teacher_action.reward
                 if not final_sentence:
@@ -349,6 +360,10 @@ def teach(self, agent_sentence):
             if active_group_id != -1:
                 g = self._task_groups.pop(active_group_id)
                 self._task_groups.insert(0, g)
-            return_action = TeacherAction(final_reward, final_sentence, done,
-                                          success=success)
+            return_action = TeacherAction(
+                final_reward,
+                final_sentence,
+                done,
+                success=success,
+                goal_range=goal_range)
         return return_action

From 341460c26b24e3ccc78fd1be26fcedcefe62edef Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Mon, 3 Aug 2020 12:00:03 -0700
Subject: [PATCH 09/53] add comments for goal conditioned task

---
 python/social_bot/envs/play_ground.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index b4a9a90a..0a65a0a5 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -95,7 +95,12 @@ def __init__(self,
             world_name (string): Select the world file, e.g., empty.world, play_ground.world,
                 grocery_ground.world
             tasks (list): a list of teacher.Task, e.g., GoalTask, KickingBallTask
-            goal_conditioned (bool): Turn on goal conditioned tasks.
+            goal_conditioned (bool): Turn on goal conditioned tasks.  Currently only GoalTask
+                with full state observation has this mode enabled.  The observation will
+                become an OrderedDict including ``achieved_goal``, ``desired_goal`` and
+                ``observation`` fields, instead of a flat np array.  ``achieved_goal`` will
+                contain agent's current 2-d (x, y) position and ``desired_goal`` will be
+                the goal object's 2-d position.  Reward and task termination remain unchanged.
             with_language (bool): The observation will be a dict with an extra sentence
             with_agent_language (bool): Include agent sentence in action space.
             use_image_observation (bool): Use image, or use low-dimentional states as

From 36c369075421c485bced5b896fe72d4f33add22b Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Wed, 12 Aug 2020 22:16:59 -0700
Subject: [PATCH 10/53] properly return random_range always, to report
 goal_range metric (episodic mean)

---
 python/social_bot/tasks.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 4791333a..9a3fcd8a 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -424,13 +424,12 @@ def run(self):
                     reward -= 1.
                 logging.debug("yielding reward: " + str(reward))
                 done = self._end_episode_after_success
-                goal_range = self._random_range if done else 0
                 agent_sentence = yield TeacherAction(
                     reward=reward,
                     sentence="well done",
                     done=done,
                     success=True,
-                    goal_range=goal_range)
+                    goal_range=self._random_range)
                 steps_since_last_reward = 0
                 if self._switch_goal_within_episode:
                     self.pick_goal()
@@ -464,12 +463,11 @@ def run(self):
                     self._push_reward_queue(0)
                     done = self.end_on_hitting_distraction
                 self._prev_dist = dist
-                goal_range = self._random_range if done else 0
                 agent_sentence = yield TeacherAction(
                     reward=reward,
                     sentence=self._goal_name,
                     done=done,
-                    goal_range=goal_range)
+                    goal_range=self._random_range)
         reward = -1.0
         logging.debug("yielding reward: {}, took more than {} steps".format(
             str(reward), str(self._max_steps)))

From 596e0d64b0b45292de136b3752a34e3c9aec2d40 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Fri, 14 Aug 2020 11:40:28 -0700
Subject: [PATCH 11/53] fix goal_range reporting to only report it once at
 episode end

---
 python/social_bot/tasks.py | 48 +++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 9a3fcd8a..299aea37 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -324,6 +324,7 @@ def __init__(self,
                 self._reward_thresh_to_increase_range)
         else:
             self._random_range = random_range
+        self._goal_dist = 0.
         obs_format = "image"
         if use_full_states or use_egocentric_states:
             obs_format = "full_state"
@@ -382,6 +383,19 @@ def _get_agent_loc(self):
         loc = np.array(loc)
         return loc, agent_dir
 
+    def _prepare_teacher_action(self, reward, sentence, done, success=False):
+        goal_dist = 0.
+        if done:
+            goal_dist = self._goal_dist
+            # clear self._goal_dist so it is only output once
+            self._goal_dist = 0.
+        return TeacherAction(
+            reward=reward,
+            sentence=sentence,
+            done=done,
+            success=success,
+            goal_range=goal_dist)
+
     def run(self):
         """ Start a teaching episode for this task. """
         agent_sentence = yield
@@ -424,12 +438,11 @@ def run(self):
                     reward -= 1.
                 logging.debug("yielding reward: " + str(reward))
                 done = self._end_episode_after_success
-                agent_sentence = yield TeacherAction(
+                agent_sentence = yield self._prepare_teacher_action(
                     reward=reward,
                     sentence="well done",
                     done=done,
-                    success=True,
-                    goal_range=self._random_range)
+                    success=True)
                 steps_since_last_reward = 0
                 if self._switch_goal_within_episode:
                     self.pick_goal()
@@ -444,11 +457,8 @@ def run(self):
                 logging.debug(
                     "yielding reward: {}, farther than {} from goal".format(
                         str(reward), str(self._fail_distance_thresh)))
-                yield TeacherAction(
-                    reward=reward,
-                    sentence="failed",
-                    done=True,
-                    goal_range=self._random_range)
+                yield self._prepare_teacher_action(
+                    reward=reward, sentence="failed", done=True)
             else:
                 if self._sparse_reward:
                     reward = 0
@@ -463,11 +473,8 @@ def run(self):
                     self._push_reward_queue(0)
                     done = self.end_on_hitting_distraction
                 self._prev_dist = dist
-                agent_sentence = yield TeacherAction(
-                    reward=reward,
-                    sentence=self._goal_name,
-                    done=done,
-                    goal_range=self._random_range)
+                agent_sentence = yield self._prepare_teacher_action(
+                    reward=reward, sentence=self._goal_name, done=done)
         reward = -1.0
         logging.debug("yielding reward: {}, took more than {} steps".format(
             str(reward), str(self._max_steps)))
@@ -475,11 +482,8 @@ def run(self):
         if self.should_use_curriculum_training():
             logging.debug("reward queue len: {}, sum: {}".format(
                 str(len(self._q)), str(sum(self._q))))
-        yield TeacherAction(
-            reward=reward,
-            sentence="failed",
-            done=True,
-            goal_range=self._random_range)
+        yield self._prepare_teacher_action(
+            reward=reward, sentence="failed", done=True)
 
     def _get_distraction_penalty(self, agent_loc, dot,
                                  prev_min_dist_to_distraction):
@@ -516,12 +520,13 @@ def _move_goal(self, goal, agent_loc, agent_dir):
         Move goal as well as all distraction objects to a random location.
         """
         avoid_locations = [agent_loc]
-        loc = self._move_obj(
+        loc, dist = self._move_obj(
             obj=goal,
             agent_loc=agent_loc,
             agent_dir=agent_dir,
             is_goal=True,
             avoid_locations=avoid_locations)
+        self._goal_dist += dist
         avoid_locations.append(loc)
         distractions = OrderedDict()
         for item in self._distraction_list:
@@ -530,7 +535,7 @@ def _move_goal(self, goal, agent_loc, agent_dir):
         if len(distractions) and self._curriculum_distractions:
             for item, _ in distractions.items():
                 distraction = self._world.get_agent(item)
-                loc = self._move_obj(
+                loc, _ = self._move_obj(
                     obj=distraction,
                     agent_loc=agent_loc,
                     agent_dir=agent_dir,
@@ -553,6 +558,7 @@ def _move_obj(self,
             range = self._random_range
             self._is_full_range_in_curriculum = False
         attempts = 0
+        dist = range
         while True:
             attempts += 1
             dist = random.random() * range
@@ -592,7 +598,7 @@ def _move_obj(self,
         self._prev_dist = self._initial_dist
         obj.reset()
         obj.set_pose((loc, (0, 0, 0)))
-        return loc
+        return loc, dist
 
     def _random_move_objects(self, random_range=10.0):
         obj_num = len(self._object_list)

From 5ef28e97ca7e64564cef85cc1a79e59dfbf065ff Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Mon, 24 Aug 2020 23:10:28 -0700
Subject: [PATCH 12/53] terminate gzclient at env close(), fix bug where
 direction not ignored when flag says to ignore goal direction

---
 python/social_bot/envs/gazebo_base.py | 1 +
 python/social_bot/tasks.py            | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/social_bot/envs/gazebo_base.py b/python/social_bot/envs/gazebo_base.py
index 4dffc92c..a7c94f59 100644
--- a/python/social_bot/envs/gazebo_base.py
+++ b/python/social_bot/envs/gazebo_base.py
@@ -134,6 +134,7 @@ def render(self, mode='human'):
     def close(self):
         super().close()
         gazebo.close_without_model_base_fini()
+        self.__del__()
 
     def insert_model(self, model, name=None, pose="0 0 0 0 0 0"):
         """
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 299aea37..398caf87 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -396,6 +396,9 @@ def _prepare_teacher_action(self, reward, sentence, done, success=False):
             success=success,
             goal_range=goal_dist)
 
+    def _within_angle(self, dot):
+        return (not self._success_with_angle_requirement) or dot > 0.707
+
     def run(self):
         """ Start a teaching episode for this task. """
         agent_sentence = yield
@@ -429,8 +432,8 @@ def run(self):
 
             # TODO(Le): compare achieved goal with desired goal if task is
             # goal conditioned?
-            if dist < self._success_distance_thresh and (
-                    not self._success_with_angle_requirement or dot > 0.707):
+            if dist < self._success_distance_thresh and self._within_angle(
+                    dot):
                 # within 45 degrees of the agent direction
                 reward = 1.0 - distraction_penalty
                 self._push_reward_queue(max(reward, 0))
@@ -504,7 +507,7 @@ def _get_distraction_penalty(self, agent_loc, dot,
                 if (distraction_dist >=
                         self._distraction_penalty_distance_thresh):
                     continue
-                if obj_name == self._goal_name and dot > 0.707:
+                if obj_name == self._goal_name and self._within_angle(dot):
                     continue  # correctly getting to goal, no penalty
                 if distraction_dist < curr_min_dist:
                     curr_min_dist = distraction_dist

From b25a51c1ab6fb36d62df02f8eba6900b575a23eb Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 27 Aug 2020 15:38:08 -0700
Subject: [PATCH 13/53] address comments

---
 python/social_bot/envs/gazebo_base.py |  7 ++---
 python/social_bot/envs/play_ground.py |  6 ++++-
 python/social_bot/tasks.py            | 37 +++++----------------------
 3 files changed, 14 insertions(+), 36 deletions(-)

diff --git a/python/social_bot/envs/gazebo_base.py b/python/social_bot/envs/gazebo_base.py
index a7c94f59..e698d0f3 100644
--- a/python/social_bot/envs/gazebo_base.py
+++ b/python/social_bot/envs/gazebo_base.py
@@ -134,7 +134,8 @@ def render(self, mode='human'):
     def close(self):
         super().close()
         gazebo.close_without_model_base_fini()
-        self.__del__()
+        if self._rendering_process is not None:
+            self._rendering_process.terminate()
 
     def insert_model(self, model, name=None, pose="0 0 0 0 0 0"):
         """
@@ -194,10 +195,6 @@ def seed(self, seed=None):
         """Gym interface for setting random seed."""
         random.seed(seed)
 
-    def __del__(self):
-        if self._rendering_process is not None:
-            self._rendering_process.terminate()
-
 
 def _modify_world_xml(xml, modifications):
     """Modify world xml content
diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index 0a65a0a5..214c62ac 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -100,7 +100,9 @@ def __init__(self,
                 become an OrderedDict including ``achieved_goal``, ``desired_goal`` and
                 ``observation`` fields, instead of a flat np array.  ``achieved_goal`` will
                 contain agent's current 2-d (x, y) position and ``desired_goal`` will be
-                the goal object's 2-d position.  Reward and task termination remain unchanged.
+                the goal object's 2-d position.  Reward becomes 0 for reaching goal, -1 for
+                any other step (assuming no distraction penalty).  Task termination remains
+                unchanged.
             with_language (bool): The observation will be a dict with an extra sentence
             with_agent_language (bool): Include agent sentence in action space.
             use_image_observation (bool): Use image, or use low-dimentional states as
@@ -199,9 +201,11 @@ def __init__(self,
         self.observation_space = self._agent.get_observation_space(
             self._teacher)
         if self._goal_conditioned:
+            assert agent_type == 'pioneer2dx_noplugin'
             assert not with_language
             assert not use_image_observation
             assert not image_with_internal_states
+            assert len(tasks) == 1 and tasks[0] == GoalTask
             # state observation
             goal_space = gym.spaces.Box(
                 low=-np.inf, high=np.inf, shape=(2, ), dtype=np.float32)
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 398caf87..05ab2d6a 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -204,8 +204,7 @@ def __init__(self,
                  additional_observation_list=[],
                  use_full_states=False,
                  use_egocentric_states=False,
-                 egocentric_perception_range=0,
-                 order_obj_by_view=False):
+                 egocentric_perception_range=0):
         """
         Args:
             env (gym.Env): an instance of Environment
@@ -219,7 +218,8 @@ def __init__(self,
             fail_distance_thresh (float): if the agent moves away from the goal more than this distance,
                 it's considered a failure and is given reward -1
             distraction_penalty_distance_thresh (float): if positive, penalize agent getting too close
-                to distraction objects (objects that are not the goal itself)
+                to distraction objects (objects that include the goal itself, as approaching goal without
+                facing it is considered hitting a distraction)
             distraction_penalty (float): positive float of how much to penalize getting too close to
                 distraction objects
             random_agent_orientation (bool): whether randomize the orientation (yaw) of the agent at the beginning of an
@@ -254,8 +254,6 @@ def __init__(self,
             egocentric_perception_range (float): the max range in degree to limit the agent's observation.
                 E.g. 60 means object is only visible when it's within +/-60 degrees in front of the agent's
                 direction (yaw).
-            order_obj_by_view (bool): False to order object coordinates (full states) in object id order;
-                True to order according to y axis value (left to right) together with object id.
         """
         self._max_play_ground_size = 5  # play ground will be (-5, 5) for both x and y axes.
         # TODO: Remove the default grey walls in the play ground world file,
@@ -306,7 +304,6 @@ def __init__(self,
         self._use_full_states = use_full_states
         self._use_egocentric_states = use_egocentric_states
         self._egocentric_perception_range = egocentric_perception_range
-        self._order_obj_by_view = order_obj_by_view
         if self.should_use_curriculum_training():
             self._orig_random_range = random_range
             self._random_range = start_range
@@ -333,11 +330,10 @@ def __init__(self,
             obs_relative = "absolute"
         logging.info("Observations: {}, {}.".format(obs_format, obs_relative))
         if use_full_states and not use_egocentric_states:
-            if not order_obj_by_view:
-                logging.info(
-                    "Dims: 0-5: agent's velocity and angular " +
-                    "velocity, 6-11: agent's position and pose, 12-13: goal x, y"
-                    + ", all distractions' x, y coordinates.")
+            logging.info(
+                "Dims: 0-5: agent's velocity and angular " +
+                "velocity, 6-11: agent's position and pose, 12-13: goal x, y" +
+                ", all distractions' x, y coordinates.")
         self.task_vocab += self._object_list
         self._env.insert_model_list(self._object_list)
 
@@ -655,7 +651,6 @@ def task_specific_observation(self, agent):
             else:
                 pose = np.concatenate((pose, obj_pos), axis=0)
 
-        obj_array = []
         agent_pose = np.array(agent.get_pose()).flatten()
         if self._use_full_states or self._use_egocentric_states:
             yaw = agent_pose[5]
@@ -666,18 +661,6 @@ def task_specific_observation(self, agent):
             else:
                 rvx, rvy = vx, vy
             obs = [rvx, rvy, vz, a1, a2, a3] + list(agent_pose)
-            if self._egocentric_perception_range > 0:
-                obj_array.extend(
-                    [[0, rvx, rvy, vz], [1, a1, a2, a3],
-                     [2, agent_pose[0], agent_pose[1], agent_pose[2]],
-                     [3, agent_pose[3], agent_pose[4], agent_pose[5]]])
-                i = 4
-            else:
-                obj_array.extend([[0, rvx, rvy], [1, vz, a1], [2, a2, a3],
-                                  [3, agent_pose[0], agent_pose[1]],
-                                  [4, agent_pose[2], agent_pose[3]],
-                                  [5, agent_pose[4], agent_pose[5]]])
-                i = 6
             # adds objects' (goal's as well as distractions') egocentric
             # coordinates to observation
             while len(pose) > 1:
@@ -701,10 +684,8 @@ def task_specific_observation(self, agent):
                         rotated_x = 0.
                         rotated_y = 0.
                         magnitude = 0.
-                    obj_array.append([i, rotated_x, rotated_y, magnitude])
                     obs.extend([rotated_x, rotated_y, magnitude])
                 else:
-                    obj_array.append([i, rotated_x, rotated_y])
                     obs.extend([rotated_x, rotated_y])
                 pose = pose[3:]
             obs = np.array(obs)
@@ -714,10 +695,6 @@ def task_specific_observation(self, agent):
             obs = np.concatenate((pose, agent_pose, agent_vel, joints_states),
                                  axis=0)
 
-        if self._order_obj_by_view:
-            # TODO: skip first few dims corresponding to rvx, a1 etc..
-            obj_array.sort(key=operator.itemgetter(2))
-            obs = np.array(obj_array).flatten()
         return obs
 
 

From 9f3051d581b97c036a40fbc45ab0942ceeaff973 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Fri, 28 Aug 2020 14:40:01 -0700
Subject: [PATCH 14/53] make "human" rendering consistent with "rgb_array" -
 video recording

---
 python/social_bot/envs/gazebo_base.py | 70 +++++++++++++--------------
 python/social_bot/envs/play_ground.py |  9 +++-
 2 files changed, 43 insertions(+), 36 deletions(-)

diff --git a/python/social_bot/envs/gazebo_base.py b/python/social_bot/envs/gazebo_base.py
index e698d0f3..595388d1 100644
--- a/python/social_bot/envs/gazebo_base.py
+++ b/python/social_bot/envs/gazebo_base.py
@@ -82,6 +82,41 @@ def render(self, mode='human'):
         Args:
             mode (str): 'human' and 'rgb_array' is supported.
         """
+        if self._rendering_camera is None:
+            render_camera_sdf = """
+            <?xml version='1.0'?>
+            <sdf version ='1.6'>
+            <model name ='render_camera'>
+                <static>1</static>
+                <pose>%s</pose>
+                <link name="link">
+                    <sensor name="camera" type="camera">
+                        <camera>
+                        <horizontal_fov>0.95</horizontal_fov>
+                        <image>
+                            <width>640</width>
+                            <height>480</height>
+                        </image>
+                        <clip>
+                            <near>0.1</near>
+                            <far>100</far>
+                        </clip>
+                        </camera>
+                        <always_on>1</always_on>
+                        <update_rate>30</update_rate>
+                        <visualize>true</visualize>
+                    </sensor>
+                </link>
+            </model>
+            </sdf>
+            """
+            render_camera_sdf = render_camera_sdf % self._rendering_cam_pose
+            self._world.insertModelFromSdfString(render_camera_sdf)
+            time.sleep(0.2)
+            self._world.step(20)
+            self._rendering_camera = self._world.get_agent('render_camera')
+        image = self._rendering_camera.get_camera_observation(
+            "default::render_camera::link::camera")
         if mode == 'human':
             if self._rendering_process is None:
                 from subprocess import Popen
@@ -91,41 +126,6 @@ def render(self, mode='human'):
                 self._rendering_process = Popen(['gzclient'])
             return
         if mode == 'rgb_array':
-            if self._rendering_camera is None:
-                render_camera_sdf = """
-                <?xml version='1.0'?>
-                <sdf version ='1.6'>
-                <model name ='render_camera'>
-                    <static>1</static>
-                    <pose>%s</pose>
-                    <link name="link">
-                        <sensor name="camera" type="camera">
-                            <camera>
-                            <horizontal_fov>0.95</horizontal_fov>
-                            <image>
-                                <width>640</width>
-                                <height>480</height>
-                            </image>
-                            <clip>
-                                <near>0.1</near>
-                                <far>100</far>
-                            </clip>
-                            </camera>
-                            <always_on>1</always_on>
-                            <update_rate>30</update_rate>
-                            <visualize>true</visualize>
-                        </sensor>
-                    </link>
-                </model>
-                </sdf>
-                """
-                render_camera_sdf = render_camera_sdf % self._rendering_cam_pose
-                self._world.insertModelFromSdfString(render_camera_sdf)
-                time.sleep(0.2)
-                self._world.step(20)
-                self._rendering_camera = self._world.get_agent('render_camera')
-            image = self._rendering_camera.get_camera_observation(
-                "default::render_camera::link::camera")
             return np.array(image)
 
         raise NotImplementedError("rendering mode: " + mode +
diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index 214c62ac..3f891b84 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -218,8 +218,15 @@ def _get_observation(self, teacher_sentence):
         obs = self._agent.get_observation(self._teacher, teacher_sentence)
         if self._goal_conditioned:
             flat_obs = obs
-            obs = OrderedDict(observation=obs)
+            obs = OrderedDict()
+            obs['observation'] = flat_obs  # flat_obs[15:]
             obs['achieved_goal'] = flat_obs[6:8]
+            # We can use this field to relabel future state in goal conditioned
+            # policy on top of the achieved_goal field.
+            # High level needs to fill this in for the desired_goal.
+            # When to use real reward, when to use fake reward?
+            # obs['aux_achieved'] = np.concatenate(
+            #     (flat_obs[:6], flat_obs[8:12]), axis=0)
             obs['desired_goal'] = flat_obs[12:14]
         return obs
 

From 8281a65592deb7b2bd6c20cdd24c8fbeeb6e1c86 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 3 Sep 2020 23:36:54 -0700
Subject: [PATCH 15/53] use aux achieved field in goal conditioned GoalTask.

---
 pygazebo/pygazebo.cc                  | 15 ++++++++-
 python/social_bot/envs/gazebo_base.py |  8 +++--
 python/social_bot/envs/play_ground.py | 44 ++++++++++++++++++++-------
 3 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/pygazebo/pygazebo.cc b/pygazebo/pygazebo.cc
index aaafa439..c9ac1548 100644
--- a/pygazebo/pygazebo.cc
+++ b/pygazebo/pygazebo.cc
@@ -507,6 +507,19 @@ void CloseWithoutModelbaseFini() {
   gazebo_sensor_initialized = false;
 }
 
+void CloseWithoutFini() {
+  // this causes segmentation fault when run.
+  gazebo::physics::stop_worlds();
+  gazebo::sensors::stop();
+  gazebo::util::LogRecord::Instance()->Stop();
+  gazebo::transport::stop();
+
+  gazebo::transport::fini();
+
+  gazebo_initialized = false;
+  gazebo_sensor_initialized = false;
+}
+
 void Close() {
   gazebo::shutdown();
   gazebo_initialized = false;
@@ -573,7 +586,7 @@ PYBIND11_MODULE(pygazebo, m) {
 
   m.def("close_without_model_base_fini",
         &CloseWithoutModelbaseFini,
-        "A customized close function without execute ModelbaseFini"
+        "A customized close function without execute ModelbaseFini. "
         "For some unknwon reason, ModelbaseFini() in the gazebo.shutdown() "
         "makes the"
         "process fail to exit when the environment is wrapped with process");
diff --git a/python/social_bot/envs/gazebo_base.py b/python/social_bot/envs/gazebo_base.py
index 595388d1..d55df8ac 100644
--- a/python/social_bot/envs/gazebo_base.py
+++ b/python/social_bot/envs/gazebo_base.py
@@ -58,7 +58,9 @@ def __init__(self,
         if port is None:
             port = 0
         self._port = port
-        # to avoid different parallel simulation has the same randomness
+        # This avoids different parallel simulations having the same randomness.
+        # When calling from alf, alf.environments.utils.create_environment calls
+        # env.seed() afterwards, which is the real seed being used.  Not this one.
         random.seed(port)
         self._rendering_process = None
         self._rendering_camera = None
@@ -115,8 +117,6 @@ def render(self, mode='human'):
             time.sleep(0.2)
             self._world.step(20)
             self._rendering_camera = self._world.get_agent('render_camera')
-        image = self._rendering_camera.get_camera_observation(
-            "default::render_camera::link::camera")
         if mode == 'human':
             if self._rendering_process is None:
                 from subprocess import Popen
@@ -126,6 +126,8 @@ def render(self, mode='human'):
                 self._rendering_process = Popen(['gzclient'])
             return
         if mode == 'rgb_array':
+            image = self._rendering_camera.get_camera_observation(
+                "default::render_camera::link::camera")
             return np.array(image)
 
         raise NotImplementedError("rendering mode: " + mode +
diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index 3f891b84..e1c77253 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -73,6 +73,7 @@ def __init__(self,
                  world_name="play_ground.world",
                  tasks=[GoalTask],
                  goal_conditioned=False,
+                 use_aux_achieved=False,
                  with_language=False,
                  with_agent_language=False,
                  use_image_observation=False,
@@ -103,6 +104,8 @@ def __init__(self,
                 the goal object's 2-d position.  Reward becomes 0 for reaching goal, -1 for
                 any other step (assuming no distraction penalty).  Task termination remains
                 unchanged.
+            use_aux_achieved (bool): if True, pull out speed, pose dimensions into a separate
+                field: aux_achieved.  Only valid when goal_conditioned is True.
             with_language (bool): The observation will be a dict with an extra sentence
             with_agent_language (bool): Include agent sentence in action space.
             use_image_observation (bool): Use image, or use low-dimentional states as
@@ -138,6 +141,7 @@ def __init__(self,
 
         self._action_cost = action_cost
         self._goal_conditioned = goal_conditioned
+        self._use_aux_achieved = use_aux_achieved
         self._with_language = with_language
         self._seq_length = vocab_sequence_length
         self._with_agent_language = with_language and with_agent_language
@@ -207,27 +211,45 @@ def __init__(self,
             assert not image_with_internal_states
             assert len(tasks) == 1 and tasks[0] == GoalTask
             # state observation
+            goal_shape = 2
             goal_space = gym.spaces.Box(
-                low=-np.inf, high=np.inf, shape=(2, ), dtype=np.float32)
-            self.observation_space = gym.spaces.Dict(
+                low=-np.inf,
+                high=np.inf,
+                shape=(goal_shape, ),
+                dtype=np.float32)
+            d = OrderedDict(
                 observation=self.observation_space,
-                desired_goal=goal_space,
-                achieved_goal=goal_space)
+                achieved_goal=goal_space,
+                desired_goal=goal_space)
+            if use_aux_achieved:
+                ob_shape = self.observation_space.shape[0]
+                aux_shape = 10
+                ob_space = gym.spaces.Box(
+                    low=-np.inf,
+                    high=np.inf,
+                    shape=(ob_shape - aux_shape - goal_shape * 2, ),
+                    dtype=np.float32)
+                aux_space = gym.spaces.Box(
+                    low=-np.inf,
+                    high=np.inf,
+                    shape=(aux_shape, ),
+                    dtype=np.float32)
+                d["observation"] = ob_space
+                d["aux_achieved"] = aux_space
+            self.observation_space = gym.spaces.Dict(**d)
 
     def _get_observation(self, teacher_sentence):
         obs = self._agent.get_observation(self._teacher, teacher_sentence)
         if self._goal_conditioned:
             flat_obs = obs
             obs = OrderedDict()
-            obs['observation'] = flat_obs  # flat_obs[15:]
+            obs['observation'] = flat_obs
             obs['achieved_goal'] = flat_obs[6:8]
-            # We can use this field to relabel future state in goal conditioned
-            # policy on top of the achieved_goal field.
-            # High level needs to fill this in for the desired_goal.
-            # When to use real reward, when to use fake reward?
-            # obs['aux_achieved'] = np.concatenate(
-            #     (flat_obs[:6], flat_obs[8:12]), axis=0)
             obs['desired_goal'] = flat_obs[12:14]
+            if self._use_aux_achieved:
+                obs['observation'] = flat_obs[14:]
+                obs['aux_achieved'] = np.concatenate(
+                    (flat_obs[:6], flat_obs[8:12]), axis=0)
         return obs
 
     def reset(self):

From 9bd0d528ae0f08e63c56be1de7a48e4e3a90aed6 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Fri, 4 Sep 2020 15:44:45 -0700
Subject: [PATCH 16/53] add the option to only plan x-y related dimensions in
 aux_achieved

---
 python/social_bot/envs/play_ground.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index e1c77253..6abd8cff 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -74,6 +74,7 @@ def __init__(self,
                  tasks=[GoalTask],
                  goal_conditioned=False,
                  use_aux_achieved=False,
+                 xy_only_aux=False,
                  with_language=False,
                  with_agent_language=False,
                  use_image_observation=False,
@@ -106,6 +107,8 @@ def __init__(self,
                 unchanged.
             use_aux_achieved (bool): if True, pull out speed, pose dimensions into a separate
                 field: aux_achieved.  Only valid when goal_conditioned is True.
+            xy_only_aux (bool): exclude irrelevant dimensions (z-axis movements) from
+                aux_achieved field.
             with_language (bool): The observation will be a dict with an extra sentence
             with_agent_language (bool): Include agent sentence in action space.
             use_image_observation (bool): Use image, or use low-dimentional states as
@@ -142,6 +145,7 @@ def __init__(self,
         self._action_cost = action_cost
         self._goal_conditioned = goal_conditioned
         self._use_aux_achieved = use_aux_achieved
+        self._xy_only_aux = xy_only_aux
         self._with_language = with_language
         self._seq_length = vocab_sequence_length
         self._with_agent_language = with_language and with_agent_language
@@ -224,6 +228,8 @@ def __init__(self,
             if use_aux_achieved:
                 ob_shape = self.observation_space.shape[0]
                 aux_shape = 10
+                if self._xy_only_aux:
+                    aux_shape = 4
                 ob_space = gym.spaces.Box(
                     low=-np.inf,
                     high=np.inf,
@@ -250,6 +256,14 @@ def _get_observation(self, teacher_sentence):
                 obs['observation'] = flat_obs[14:]
                 obs['aux_achieved'] = np.concatenate(
                     (flat_obs[:6], flat_obs[8:12]), axis=0)
+                if self._xy_only_aux:
+                    # 2: z-speed, 3, 4: angular velocity, 5: yaw-vel, 6, 7: x, y, 8: z, 9, 10, 11: pose
+                    obs['observation'] = np.concatenate(
+                        (flat_obs[2:5], flat_obs[8:11], flat_obs[14:]), axis=0)
+                    obs['aux_achieved'] = np.concatenate(
+                        (flat_obs[0:2], np.expand_dims(flat_obs[5], 0),
+                         np.expand_dims(flat_obs[11], 0)),
+                        axis=0)
         return obs
 
     def reset(self):

From a94601fb80820aaf5e4831e9e79fea3b6c0bb2f1 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Tue, 8 Sep 2020 23:18:45 -0700
Subject: [PATCH 17/53] multi dimensional reward for goal conditioned goal task

---
 python/social_bot/envs/play_ground.py |   3 +
 python/social_bot/tasks.py            | 102 ++++++++++++++++++++------
 python/social_bot/teacher.py          |  14 +++-
 3 files changed, 96 insertions(+), 23 deletions(-)

diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index 6abd8cff..dab8441e 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -318,6 +318,9 @@ def step(self, action):
         info = {"is_success": teacher_action.success}
         if self._has_goal_task:
             info["goal_range"] = teacher_action.goal_range
+        # Maybe use multi dimensional reward
+        if teacher_action.rewards is not None:
+            reward = teacher_action.rewards
         return obs, reward, teacher_action.done, info
 
     def get_step_time(self):
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 05ab2d6a..0f0242cd 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -178,8 +178,10 @@ def __init__(self,
                      'coke_can', 'table', 'car_wheel', 'plastic_cup', 'beer'
                  ],
                  goal_conditioned=False,
+                 multi_dim_reward=False,
                  end_on_hitting_distraction=False,
                  end_episode_after_success=False,
+                 reset_time_limit_on_success=True,
                  success_distance_thresh=0.5,
                  fail_distance_thresh=2.0,
                  distraction_penalty_distance_thresh=0,
@@ -211,9 +213,13 @@ def __init__(self,
             max_steps (int): episode will end if not reaching gaol in so many steps
             goal_name (string): name of the goal in the world
             distraction_list (list of string): a list of model. the model shoud be in gazebo database
+            goal_conditioned (bool): if True, each step has -1 reward, unless at goal state, which gives 0.
+            multi_dim_reward (bool): if True, separate goal reward and distraction penalty into two dimensions.
             end_episode_after_success (bool): if True, the episode will end once the goal is reached. A True value of this
                 flag will overwrite the effects of flags ``switch_goal_within_episode`` and ``move_goal_during_episode``.
             end_on_hitting_distraction (bool): whether to end episode on hitting distraction
+            reset_time_limit_on_success (bool): if not ending after success, if hit success before time limit,
+                reset clock to 0.
             success_distance_thresh (float): the goal is reached if it's within this distance to the agent
             fail_distance_thresh (float): if the agent moves away from the goal more than this distance,
                 it's considered a failure and is given reward -1
@@ -263,7 +269,10 @@ def __init__(self,
             env=env, max_steps=max_steps, reward_weight=reward_weight)
         self._goal_name = goal_name
         self._goal_conditioned = goal_conditioned
+        self._multi_dim_reward = multi_dim_reward
         self.end_on_hitting_distraction = end_on_hitting_distraction
+        self._end_episode_after_success = end_episode_after_success
+        self._reset_time_limit_on_success = reset_time_limit_on_success
         self._success_distance_thresh = success_distance_thresh
         self._fail_distance_thresh = fail_distance_thresh
         self._distraction_penalty_distance_thresh = distraction_penalty_distance_thresh
@@ -289,7 +298,6 @@ def __init__(self,
             self._object_list.append(goal_name)
         self._goals = self._object_list
         self._move_goal_during_episode = move_goal_during_episode
-        self._end_episode_after_success = end_episode_after_success
         self._success_with_angle_requirement = success_with_angle_requirement
         if not additional_observation_list:
             additional_observation_list = self._object_list
@@ -379,8 +387,15 @@ def _get_agent_loc(self):
         loc = np.array(loc)
         return loc, agent_dir
 
-    def _prepare_teacher_action(self, reward, sentence, done, success=False):
+    def _prepare_teacher_action(self,
+                                reward,
+                                sentence,
+                                done,
+                                success=False,
+                                rewards=None):
         goal_dist = 0.
+        if rewards is not None:
+            rewards = rewards.astype(np.float32)
         if done:
             goal_dist = self._goal_dist
             # clear self._goal_dist so it is only output once
@@ -390,11 +405,23 @@ def _prepare_teacher_action(self, reward, sentence, done, success=False):
             sentence=sentence,
             done=done,
             success=success,
-            goal_range=goal_dist)
+            goal_range=goal_dist,
+            rewards=rewards)
 
     def _within_angle(self, dot):
         return (not self._success_with_angle_requirement) or dot > 0.707
 
+    def _get_goal_dist(self, goal):
+        loc, agent_dir = self._get_agent_loc()
+        goal_loc, _ = goal.get_pose()
+        goal_loc = np.array(goal_loc)
+        dist = np.linalg.norm(loc - goal_loc)
+        # dir from get_pose is (roll, pitch, yaw)
+        dir = np.array([math.cos(agent_dir[2]), math.sin(agent_dir[2])])
+        goal_dir = (goal_loc[0:2] - loc[0:2]) / dist
+        dot = sum(dir * goal_dir)
+        return dist, dot
+
     def run(self):
         """ Start a teaching episode for this task. """
         agent_sentence = yield
@@ -411,21 +438,13 @@ def run(self):
         self._move_goal(goal, loc, agent_dir)
         steps_since_last_reward = 0
         prev_min_dist_to_distraction = 100
+        rewards = None  # reward array in multi_dim_reward case
         while steps_since_last_reward < self._max_steps:
             steps_since_last_reward += 1
-            loc, agent_dir = self._get_agent_loc()
-            goal_loc, _ = goal.get_pose()
-            goal_loc = np.array(goal_loc)
-            dist = np.linalg.norm(loc - goal_loc)
-            # dir from get_pose is (roll, pitch, yaw)
-            dir = np.array([math.cos(agent_dir[2]), math.sin(agent_dir[2])])
-            goal_dir = (goal_loc[0:2] - loc[0:2]) / dist
-            dot = sum(dir * goal_dir)
-
+            dist, dot = self._get_goal_dist(goal)
             distraction_penalty, prev_min_dist_to_distraction = (
                 self._get_distraction_penalty(loc, dot,
                                               prev_min_dist_to_distraction))
-
             # TODO(Le): compare achieved goal with desired goal if task is
             # goal conditioned?
             if dist < self._success_distance_thresh and self._within_angle(
@@ -435,14 +454,21 @@ def run(self):
                 self._push_reward_queue(max(reward, 0))
                 if self._goal_conditioned:
                     reward -= 1.
+                    if self._multi_dim_reward:
+                        rewards = np.array([0, -distraction_penalty])
+                else:
+                    if self._multi_dim_reward:
+                        rewards = np.array([1, -distraction_penalty])
                 logging.debug("yielding reward: " + str(reward))
                 done = self._end_episode_after_success
                 agent_sentence = yield self._prepare_teacher_action(
                     reward=reward,
                     sentence="well done",
                     done=done,
-                    success=True)
-                steps_since_last_reward = 0
+                    success=True,
+                    rewards=rewards)
+                if self._reset_time_limit_on_success:
+                    steps_since_last_reward = 0
                 if self._switch_goal_within_episode:
                     self.pick_goal()
                     goal = self._world.get_agent(self._goal_name)
@@ -456,8 +482,13 @@ def run(self):
                 logging.debug(
                     "yielding reward: {}, farther than {} from goal".format(
                         str(reward), str(self._fail_distance_thresh)))
+                if self._multi_dim_reward:
+                    rewards = np.array([-1, -distraction_penalty])
                 yield self._prepare_teacher_action(
-                    reward=reward, sentence="failed", done=True)
+                    reward=reward,
+                    sentence="failed",
+                    done=True,
+                    rewards=rewards)
             else:
                 if self._sparse_reward:
                     reward = 0
@@ -465,6 +496,8 @@ def run(self):
                         reward = -1
                 else:
                     reward = (self._prev_dist - dist) / self._initial_dist
+                if self._multi_dim_reward:
+                    rewards = np.array([reward, -distraction_penalty])
                 reward = reward - distraction_penalty
                 done = False
                 if distraction_penalty > 0:
@@ -473,16 +506,43 @@ def run(self):
                     done = self.end_on_hitting_distraction
                 self._prev_dist = dist
                 agent_sentence = yield self._prepare_teacher_action(
-                    reward=reward, sentence=self._goal_name, done=done)
+                    reward=reward,
+                    sentence=self._goal_name,
+                    done=done,
+                    rewards=rewards)
         reward = -1.0
-        logging.debug("yielding reward: {}, took more than {} steps".format(
-            str(reward), str(self._max_steps)))
-        self._push_reward_queue(0)
+        dist, dot = self._get_goal_dist(goal)
+        distraction_penalty, prev_min_dist_to_distraction = (
+            self._get_distraction_penalty(loc, dot,
+                                          prev_min_dist_to_distraction))
+        # TODO(Le): compare achieved goal with desired goal if task is
+        # goal conditioned?
+        success = False
+        if dist < self._success_distance_thresh and self._within_angle(dot):
+            success = True
+            reward = 1.0 - distraction_penalty
+            self._push_reward_queue(max(reward, 0))
+            if self._goal_conditioned:
+                reward -= 1.
+                if self._multi_dim_reward:
+                    rewards = np.array([0, -distraction_penalty])
+            else:
+                if self._multi_dim_reward:
+                    rewards = np.array([1, -distraction_penalty])
+        else:
+            self._push_reward_queue(0)
+            logging.debug("took more than {} steps".format(
+                str(self._max_steps)))
+        logging.debug("yielding reward: {}".format(str(reward)))
         if self.should_use_curriculum_training():
             logging.debug("reward queue len: {}, sum: {}".format(
                 str(len(self._q)), str(sum(self._q))))
         yield self._prepare_teacher_action(
-            reward=reward, sentence="failed", done=True)
+            reward=reward,
+            sentence="failed",
+            done=True,
+            success=success,
+            rewards=rewards)
 
     def _get_distraction_penalty(self, agent_loc, dot,
                                  prev_min_dist_to_distraction):
diff --git a/python/social_bot/teacher.py b/python/social_bot/teacher.py
index 7c84dd9b..cb28ab13 100644
--- a/python/social_bot/teacher.py
+++ b/python/social_bot/teacher.py
@@ -42,11 +42,13 @@ def __init__(self,
                  done=False,
                  is_idle=False,
                  success=False,
-                 goal_range=0.0):
+                 goal_range=0.0,
+                 rewards=None):
         """
         Args:
             done: end of an episode if true
             success: if the episode is successful or not
+            rewards: multi dim reward array
         """
         self.reward = reward
         self.sentence = sentence
@@ -54,6 +56,7 @@ def __init__(self,
         self.is_idle = is_idle
         self.success = success
         self.goal_range = float(goal_range)
+        self.rewards = rewards
 
 
 class TaskGroup(object):
@@ -343,6 +346,7 @@ def teach(self, agent_sentence):
             active_group_id = -1
             success = False
             goal_range = 0
+            rewards = None
             # run all groups in parallel
             for i, g in enumerate(self._task_groups):
                 teacher_action = g.teach(agent_sentence)
@@ -353,6 +357,11 @@ def teach(self, agent_sentence):
                 if teacher_action.goal_range > 0:
                     goal_range = teacher_action.goal_range
                 weight = g.get_current_reward_weight()
+                if teacher_action.rewards is not None:
+                    if rewards is None:
+                        rewards = weight * teacher_action.rewards
+                    else:
+                        rewards += weight * teacher_action.rewards
                 final_reward += weight * teacher_action.reward
                 if not final_sentence:
                     final_sentence = teacher_action.sentence
@@ -365,5 +374,6 @@ def teach(self, agent_sentence):
                 final_sentence,
                 done,
                 success=success,
-                goal_range=goal_range)
+                goal_range=goal_range,
+                rewards=rewards)
         return return_action

From dca08074335253695ff83b19e56dd9edb8cf7179 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Tue, 8 Sep 2020 23:19:08 -0700
Subject: [PATCH 18/53] adding a smaller ball to indicate target for pioneer,
 so it doesn't collide with the ball, and learn to stay close to the ball.

---
 .../models/target_ball/model.config           | 10 +++++
 .../social_bot/models/target_ball/model.sdf   | 43 +++++++++++++++++++
 2 files changed, 53 insertions(+)
 create mode 100644 python/social_bot/models/target_ball/model.config
 create mode 100644 python/social_bot/models/target_ball/model.sdf

diff --git a/python/social_bot/models/target_ball/model.config b/python/social_bot/models/target_ball/model.config
new file mode 100644
index 00000000..9aaa1ce2
--- /dev/null
+++ b/python/social_bot/models/target_ball/model.config
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<model>
+  <name>Target Ball</name>
+  <version>1.0</version>
+  <sdf version="1.5">model.sdf</sdf>
+
+  <description>
+    A ghost red ball.
+  </description>
+</model>
diff --git a/python/social_bot/models/target_ball/model.sdf b/python/social_bot/models/target_ball/model.sdf
new file mode 100644
index 00000000..7e5a742a
--- /dev/null
+++ b/python/social_bot/models/target_ball/model.sdf
@@ -0,0 +1,43 @@
+<?xml version="1.0" ?>
+<sdf version="1.5">
+  <model name="target_ball">
+    <static>true</static>
+    <pose>0 0 0 0 0 0</pose>
+    <link name="link">
+      <pose>0 0 0.527 0 0 0</pose>
+      <visual name="visual">
+        <geometry>
+          <sphere>
+            <radius>0.03</radius>
+          </sphere>
+        </geometry>
+        <material>
+          <ambient>0.6 0.1 0.1 1</ambient>
+          <diffuse>0.7 0.1 0.1 1</diffuse>
+          <specular>0 0 0 0</specular>
+          <emissive>0.0 0 0 1</emissive>
+        </material>
+      </visual>
+
+      <collision name="collision">
+        <geometry>
+          <sphere>
+            <radius>0</radius>
+          </sphere>
+        </geometry>
+        <surface>
+          <bounce>
+            <restitution_coefficient>0.0</restitution_coefficient>
+            <threshold>0</threshold>
+          </bounce>
+          <contact>
+            <ode>
+              <max_vel>0</max_vel>
+              <min_depth>0.1</min_depth>
+            </ode>
+          </contact>
+        </surface>
+      </collision>
+    </link>
+  </model>
+</sdf>

From 411a0961f76785b6bb285295eff1da0fd7657202 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 10 Sep 2020 10:33:38 -0700
Subject: [PATCH 19/53] fix missing distraction penalty

---
 python/social_bot/envs/gazebo_base.py |  1 +
 python/social_bot/tasks.py            | 11 +++++------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/social_bot/envs/gazebo_base.py b/python/social_bot/envs/gazebo_base.py
index d55df8ac..4642a30d 100644
--- a/python/social_bot/envs/gazebo_base.py
+++ b/python/social_bot/envs/gazebo_base.py
@@ -138,6 +138,7 @@ def close(self):
         gazebo.close_without_model_base_fini()
         if self._rendering_process is not None:
             self._rendering_process.terminate()
+            self._rendering_process = None
 
     def insert_model(self, model, name=None, pose="0 0 0 0 0 0"):
         """
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 0f0242cd..cf103782 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -420,7 +420,7 @@ def _get_goal_dist(self, goal):
         dir = np.array([math.cos(agent_dir[2]), math.sin(agent_dir[2])])
         goal_dir = (goal_loc[0:2] - loc[0:2]) / dist
         dot = sum(dir * goal_dir)
-        return dist, dot
+        return dist, dot, loc
 
     def run(self):
         """ Start a teaching episode for this task. """
@@ -430,18 +430,17 @@ def run(self):
             loc, agent_dir = self._agent.get_pose()
             self._agent.set_pose((loc, (agent_dir[0], agent_dir[1],
                                         2 * math.pi * random.random())))
-        loc, agent_dir = self._agent.get_pose()
-        loc = np.array(loc)
+        a_loc, a_dir = self._get_agent_loc()
         self._random_move_objects()
         self.pick_goal()
         goal = self._world.get_model(self._goal_name)
-        self._move_goal(goal, loc, agent_dir)
+        self._move_goal(goal, a_loc, a_dir)
         steps_since_last_reward = 0
         prev_min_dist_to_distraction = 100
         rewards = None  # reward array in multi_dim_reward case
         while steps_since_last_reward < self._max_steps:
             steps_since_last_reward += 1
-            dist, dot = self._get_goal_dist(goal)
+            dist, dot, loc = self._get_goal_dist(goal)
             distraction_penalty, prev_min_dist_to_distraction = (
                 self._get_distraction_penalty(loc, dot,
                                               prev_min_dist_to_distraction))
@@ -511,7 +510,7 @@ def run(self):
                     done=done,
                     rewards=rewards)
         reward = -1.0
-        dist, dot = self._get_goal_dist(goal)
+        dist, dot, loc = self._get_goal_dist(goal)
         distraction_penalty, prev_min_dist_to_distraction = (
             self._get_distraction_penalty(loc, dot,
                                           prev_min_dist_to_distraction))

From 849b5eaee6093085036809e5b92b8560c9538761 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 10 Sep 2020 11:47:20 -0700
Subject: [PATCH 20/53] address comments from JT

---
 python/social_bot/envs/play_ground.py | 74 +----------------------
 python/social_bot/gazebo_agent.py     | 86 ++++++++++++++++++++++++++-
 2 files changed, 86 insertions(+), 74 deletions(-)

diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index dab8441e..68ef5f0b 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -72,9 +72,6 @@ def __init__(self,
                  agent_type='pioneer2dx_noplugin',
                  world_name="play_ground.world",
                  tasks=[GoalTask],
-                 goal_conditioned=False,
-                 use_aux_achieved=False,
-                 xy_only_aux=False,
                  with_language=False,
                  with_agent_language=False,
                  use_image_observation=False,
@@ -97,18 +94,6 @@ def __init__(self,
             world_name (string): Select the world file, e.g., empty.world, play_ground.world,
                 grocery_ground.world
             tasks (list): a list of teacher.Task, e.g., GoalTask, KickingBallTask
-            goal_conditioned (bool): Turn on goal conditioned tasks.  Currently only GoalTask
-                with full state observation has this mode enabled.  The observation will
-                become an OrderedDict including ``achieved_goal``, ``desired_goal`` and
-                ``observation`` fields, instead of a flat np array.  ``achieved_goal`` will
-                contain agent's current 2-d (x, y) position and ``desired_goal`` will be
-                the goal object's 2-d position.  Reward becomes 0 for reaching goal, -1 for
-                any other step (assuming no distraction penalty).  Task termination remains
-                unchanged.
-            use_aux_achieved (bool): if True, pull out speed, pose dimensions into a separate
-                field: aux_achieved.  Only valid when goal_conditioned is True.
-            xy_only_aux (bool): exclude irrelevant dimensions (z-axis movements) from
-                aux_achieved field.
             with_language (bool): The observation will be a dict with an extra sentence
             with_agent_language (bool): Include agent sentence in action space.
             use_image_observation (bool): Use image, or use low-dimentional states as
@@ -143,9 +128,6 @@ def __init__(self,
         """
 
         self._action_cost = action_cost
-        self._goal_conditioned = goal_conditioned
-        self._use_aux_achieved = use_aux_achieved
-        self._xy_only_aux = xy_only_aux
         self._with_language = with_language
         self._seq_length = vocab_sequence_length
         self._with_agent_language = with_language and with_agent_language
@@ -208,63 +190,9 @@ def __init__(self,
         self.reset()
         self.observation_space = self._agent.get_observation_space(
             self._teacher)
-        if self._goal_conditioned:
-            assert agent_type == 'pioneer2dx_noplugin'
-            assert not with_language
-            assert not use_image_observation
-            assert not image_with_internal_states
-            assert len(tasks) == 1 and tasks[0] == GoalTask
-            # state observation
-            goal_shape = 2
-            goal_space = gym.spaces.Box(
-                low=-np.inf,
-                high=np.inf,
-                shape=(goal_shape, ),
-                dtype=np.float32)
-            d = OrderedDict(
-                observation=self.observation_space,
-                achieved_goal=goal_space,
-                desired_goal=goal_space)
-            if use_aux_achieved:
-                ob_shape = self.observation_space.shape[0]
-                aux_shape = 10
-                if self._xy_only_aux:
-                    aux_shape = 4
-                ob_space = gym.spaces.Box(
-                    low=-np.inf,
-                    high=np.inf,
-                    shape=(ob_shape - aux_shape - goal_shape * 2, ),
-                    dtype=np.float32)
-                aux_space = gym.spaces.Box(
-                    low=-np.inf,
-                    high=np.inf,
-                    shape=(aux_shape, ),
-                    dtype=np.float32)
-                d["observation"] = ob_space
-                d["aux_achieved"] = aux_space
-            self.observation_space = gym.spaces.Dict(**d)
 
     def _get_observation(self, teacher_sentence):
-        obs = self._agent.get_observation(self._teacher, teacher_sentence)
-        if self._goal_conditioned:
-            flat_obs = obs
-            obs = OrderedDict()
-            obs['observation'] = flat_obs
-            obs['achieved_goal'] = flat_obs[6:8]
-            obs['desired_goal'] = flat_obs[12:14]
-            if self._use_aux_achieved:
-                obs['observation'] = flat_obs[14:]
-                obs['aux_achieved'] = np.concatenate(
-                    (flat_obs[:6], flat_obs[8:12]), axis=0)
-                if self._xy_only_aux:
-                    # 2: z-speed, 3, 4: angular velocity, 5: yaw-vel, 6, 7: x, y, 8: z, 9, 10, 11: pose
-                    obs['observation'] = np.concatenate(
-                        (flat_obs[2:5], flat_obs[8:11], flat_obs[14:]), axis=0)
-                    obs['aux_achieved'] = np.concatenate(
-                        (flat_obs[0:2], np.expand_dims(flat_obs[5], 0),
-                         np.expand_dims(flat_obs[11], 0)),
-                        axis=0)
-        return obs
+        return self._agent.get_observation(self._teacher, teacher_sentence)
 
     def reset(self):
         """
diff --git a/python/social_bot/gazebo_agent.py b/python/social_bot/gazebo_agent.py
index 774e2932..7b7051a2 100644
--- a/python/social_bot/gazebo_agent.py
+++ b/python/social_bot/gazebo_agent.py
@@ -25,6 +25,7 @@
 from absl import logging
 import social_bot
 import social_bot.pygazebo as gazebo
+from social_bot.tasks import GoalTask
 
 
 @gin.configurable
@@ -43,6 +44,9 @@ def __init__(self,
                  with_language=False,
                  with_agent_language=False,
                  vocab_sequence_length=20,
+                 goal_conditioned=False,
+                 use_aux_achieved=False,
+                 xy_only_aux=False,
                  action_wrapper=None):
         """
         Args:
@@ -65,6 +69,19 @@ def __init__(self,
             with_language (bool): The observation will be a dict with an extra sentence
             with_agent_language (bool): Include language in agent's action space
             vocab_sequence_length (int): the length of encoded sequence if with_language
+            goal_conditioned (bool): Turn on goal conditioned tasks.  Currently only GoalTask
+                with full state observation has this mode enabled.  It makes the flat observation
+                array an OrderedDict including ``achieved_goal``, ``desired_goal`` and
+                ``observation`` fields.
+                ``achieved_goal`` will contain agent's current 2-d (x, y) position and
+                ``desired_goal`` will be the goal object's 2-d position.
+                Additionally, the flag with the same name in GoalTask makes reward 0 for
+                reaching goal and -1 for any other step (assuming no distraction penalty).
+                Task termination remains unchanged.
+            use_aux_achieved (bool): if True, pull out speed, pose dimensions into a separate
+                field: aux_achieved.  Only valid when goal_conditioned is True.
+            xy_only_aux (bool): exclude irrelevant dimensions (z-axis movements) from
+                aux_achieved field.
             action_wrapper (None|class): Some times primitive joints is not wanted, e.g., has
                 redundant dimensions or offset. If not None, this is used to transform the agent
                 actions. See ActionWrapper of gazebo_agent.py for example.
@@ -78,6 +95,9 @@ def __init__(self,
         self._with_agent_language = with_agent_language
         self._vocab_sequence_length = vocab_sequence_length
         self._sentence_space = None
+        self._goal_conditioned = goal_conditioned
+        self._use_aux_achieved = use_aux_achieved
+        self._xy_only_aux = xy_only_aux
 
         if config == None:
             # Load agent configurations
@@ -170,6 +190,25 @@ def get_observation(self, teacher, sentence_raw="hello"):
             obs = self.get_camera_observation()
         else:  # observation is pure low-dimentional states
             obs = teacher.get_task_specific_observation(self)
+            if self._goal_conditioned:
+                flat_obs = obs
+                obs = OrderedDict()
+                obs['observation'] = flat_obs
+                obs['achieved_goal'] = flat_obs[6:8]
+                obs['desired_goal'] = flat_obs[12:14]
+                if self._use_aux_achieved:
+                    obs['observation'] = flat_obs[14:]
+                    obs['aux_achieved'] = np.concatenate(
+                        (flat_obs[:6], flat_obs[8:12]), axis=0)
+                    if self._xy_only_aux:
+                        # 2: z-speed, 3, 4: angular velocity, 5: yaw-vel, 6, 7: x, y, 8: z, 9, 10, 11: pose
+                        obs['observation'] = np.concatenate(
+                            (flat_obs[2:5], flat_obs[8:11], flat_obs[14:]),
+                            axis=0)
+                        obs['aux_achieved'] = np.concatenate(
+                            (flat_obs[0:2], np.expand_dims(flat_obs[5], 0),
+                             np.expand_dims(flat_obs[11], 0)),
+                            axis=0)
         return obs
 
     def get_camera_observation(self):
@@ -233,7 +272,10 @@ def get_observation_space(self, teacher):
                 observations from teacher's taskgroups as a sample.
         """
         obs_sample = self.get_observation(teacher)
-        if self._with_language or self._image_with_internal_states:
+        if self._goal_conditioned:
+            observation_space = self._construct_goal_condi_space(
+                obs_sample, teacher)
+        elif self._with_language or self._image_with_internal_states:
             # observation is a dictionary
             observation_space = self._construct_dict_space(obs_sample)
         elif self._use_image_observation:
@@ -301,6 +343,48 @@ def _construct_dict_space(self, obs_sample):
         ob_space = gym.spaces.Dict(ob_space_dict)
         return ob_space
 
+    def _construct_goal_condi_space(self, obs_sample, teacher):
+        assert self.type == 'pioneer2dx_noplugin'
+        assert not self._with_language
+        assert not self._use_image_observation
+        assert not self._image_with_internal_states
+        tgs = teacher.get_task_groups()
+        assert len(tgs) == 1
+        tasks = tgs[0].get_tasks()
+        assert len(tasks) == 1 and isinstance(tasks[0], GoalTask), str(tasks)
+        # state observation
+        goal_shape = 2
+        goal_space = gym.spaces.Box(
+            low=-np.inf, high=np.inf, shape=(goal_shape, ), dtype=np.float32)
+        observation_space = gym.spaces.Box(
+            low=-np.inf,
+            high=np.inf,
+            shape=obs_sample['observation'].shape,
+            dtype=np.float32)
+        d = OrderedDict(
+            observation=observation_space,
+            achieved_goal=goal_space,
+            desired_goal=goal_space)
+        if self._use_aux_achieved:
+            ob_shape = observation_space.shape[0]
+            aux_shape = 10
+            if self._xy_only_aux:
+                aux_shape = 4
+            ob_space = gym.spaces.Box(
+                low=-np.inf,
+                high=np.inf,
+                shape=(ob_shape - aux_shape - goal_shape * 2, ),
+                dtype=np.float32)
+            aux_space = gym.spaces.Box(
+                low=-np.inf,
+                high=np.inf,
+                shape=(aux_shape, ),
+                dtype=np.float32)
+            d["observation"] = ob_space
+            d["aux_achieved"] = aux_space
+        observation_space = gym.spaces.Dict(**d)
+        return observation_space
+
     def setup_joints(self, agent, joints, agent_cfg):
         """  Setup the joints acrroding to agent configuration.
 

From 9d0dd00f401099e9f4dee907f2264d77f47273d4 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 10 Sep 2020 11:52:41 -0700
Subject: [PATCH 21/53] missed one fix

---
 python/social_bot/tasks.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index cf103782..473d6fc4 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -732,7 +732,8 @@ def task_specific_observation(self, agent):
                         x, y, yaw)
                 else:
                     rotated_x, rotated_y = x, y
-                if self._egocentric_perception_range > 0:
+                if (self._use_egocentric_states
+                        and self._egocentric_perception_range > 0):
                     dist = math.sqrt(rotated_x * rotated_x +
                                      rotated_y * rotated_y)
                     rotated_x /= dist

From 92db4a099b8dadccc12e7159c4d0ebba8b4625a6 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Tue, 15 Sep 2020 15:31:49 -0700
Subject: [PATCH 22/53] address comments, move goal_conditioned obs generation
 to GoalTask

---
 pygazebo/pygazebo.cc              | 13 ------
 python/social_bot/gazebo_agent.py | 73 +++++++------------------------
 python/social_bot/tasks.py        | 28 ++++++++++++
 3 files changed, 44 insertions(+), 70 deletions(-)

diff --git a/pygazebo/pygazebo.cc b/pygazebo/pygazebo.cc
index c9ac1548..e8712282 100644
--- a/pygazebo/pygazebo.cc
+++ b/pygazebo/pygazebo.cc
@@ -507,19 +507,6 @@ void CloseWithoutModelbaseFini() {
   gazebo_sensor_initialized = false;
 }
 
-void CloseWithoutFini() {
-  // this causes segmentation fault when run.
-  gazebo::physics::stop_worlds();
-  gazebo::sensors::stop();
-  gazebo::util::LogRecord::Instance()->Stop();
-  gazebo::transport::stop();
-
-  gazebo::transport::fini();
-
-  gazebo_initialized = false;
-  gazebo_sensor_initialized = false;
-}
-
 void Close() {
   gazebo::shutdown();
   gazebo_initialized = false;
diff --git a/python/social_bot/gazebo_agent.py b/python/social_bot/gazebo_agent.py
index 7b7051a2..062da801 100644
--- a/python/social_bot/gazebo_agent.py
+++ b/python/social_bot/gazebo_agent.py
@@ -45,8 +45,6 @@ def __init__(self,
                  with_agent_language=False,
                  vocab_sequence_length=20,
                  goal_conditioned=False,
-                 use_aux_achieved=False,
-                 xy_only_aux=False,
                  action_wrapper=None):
         """
         Args:
@@ -78,10 +76,6 @@ def __init__(self,
                 Additionally, the flag with the same name in GoalTask makes reward 0 for
                 reaching goal and -1 for any other step (assuming no distraction penalty).
                 Task termination remains unchanged.
-            use_aux_achieved (bool): if True, pull out speed, pose dimensions into a separate
-                field: aux_achieved.  Only valid when goal_conditioned is True.
-            xy_only_aux (bool): exclude irrelevant dimensions (z-axis movements) from
-                aux_achieved field.
             action_wrapper (None|class): Some times primitive joints is not wanted, e.g., has
                 redundant dimensions or offset. If not None, this is used to transform the agent
                 actions. See ActionWrapper of gazebo_agent.py for example.
@@ -96,8 +90,6 @@ def __init__(self,
         self._vocab_sequence_length = vocab_sequence_length
         self._sentence_space = None
         self._goal_conditioned = goal_conditioned
-        self._use_aux_achieved = use_aux_achieved
-        self._xy_only_aux = xy_only_aux
 
         if config == None:
             # Load agent configurations
@@ -188,27 +180,15 @@ def get_observation(self, teacher, sentence_raw="hello"):
             obs = self._create_observation_dict(teacher, sentence_raw)
         elif self._use_image_observation:  # observation is pure image
             obs = self.get_camera_observation()
+        elif self._goal_conditioned:
+            obs = self._generate_goal_conditioned_obs(teacher)
         else:  # observation is pure low-dimentional states
             obs = teacher.get_task_specific_observation(self)
-            if self._goal_conditioned:
-                flat_obs = obs
-                obs = OrderedDict()
-                obs['observation'] = flat_obs
-                obs['achieved_goal'] = flat_obs[6:8]
-                obs['desired_goal'] = flat_obs[12:14]
-                if self._use_aux_achieved:
-                    obs['observation'] = flat_obs[14:]
-                    obs['aux_achieved'] = np.concatenate(
-                        (flat_obs[:6], flat_obs[8:12]), axis=0)
-                    if self._xy_only_aux:
-                        # 2: z-speed, 3, 4: angular velocity, 5: yaw-vel, 6, 7: x, y, 8: z, 9, 10, 11: pose
-                        obs['observation'] = np.concatenate(
-                            (flat_obs[2:5], flat_obs[8:11], flat_obs[14:]),
-                            axis=0)
-                        obs['aux_achieved'] = np.concatenate(
-                            (flat_obs[0:2], np.expand_dims(flat_obs[5], 0),
-                             np.expand_dims(flat_obs[11], 0)),
-                            axis=0)
+        return obs
+
+    def _generate_goal_conditioned_obs(self, teacher):
+        tasks = teacher.get_task_groups()[0].get_tasks()
+        obs = tasks[0].generate_goal_conditioned_obs(self)
         return obs
 
     def get_camera_observation(self):
@@ -343,7 +323,7 @@ def _construct_dict_space(self, obs_sample):
         ob_space = gym.spaces.Dict(ob_space_dict)
         return ob_space
 
-    def _construct_goal_condi_space(self, obs_sample, teacher):
+    def _assert_goal_conditioned(self, teacher):
         assert self.type == 'pioneer2dx_noplugin'
         assert not self._with_language
         assert not self._use_image_observation
@@ -352,38 +332,17 @@ def _construct_goal_condi_space(self, obs_sample, teacher):
         assert len(tgs) == 1
         tasks = tgs[0].get_tasks()
         assert len(tasks) == 1 and isinstance(tasks[0], GoalTask), str(tasks)
-        # state observation
-        goal_shape = 2
-        goal_space = gym.spaces.Box(
-            low=-np.inf, high=np.inf, shape=(goal_shape, ), dtype=np.float32)
-        observation_space = gym.spaces.Box(
-            low=-np.inf,
-            high=np.inf,
-            shape=obs_sample['observation'].shape,
-            dtype=np.float32)
-        d = OrderedDict(
-            observation=observation_space,
-            achieved_goal=goal_space,
-            desired_goal=goal_space)
-        if self._use_aux_achieved:
-            ob_shape = observation_space.shape[0]
-            aux_shape = 10
-            if self._xy_only_aux:
-                aux_shape = 4
-            ob_space = gym.spaces.Box(
-                low=-np.inf,
-                high=np.inf,
-                shape=(ob_shape - aux_shape - goal_shape * 2, ),
-                dtype=np.float32)
-            aux_space = gym.spaces.Box(
+
+    def _construct_goal_condi_space(self, obs_sample, teacher):
+        self._assert_goal_conditioned(teacher)
+        d = OrderedDict()
+        for k, v in obs_sample.items():
+            d[k] = gym.spaces.Box(
                 low=-np.inf,
                 high=np.inf,
-                shape=(aux_shape, ),
+                shape=(v.shape[0], ),
                 dtype=np.float32)
-            d["observation"] = ob_space
-            d["aux_achieved"] = aux_space
-        observation_space = gym.spaces.Dict(**d)
-        return observation_space
+        return gym.spaces.Dict(**d)
 
     def setup_joints(self, agent, joints, agent_cfg):
         """  Setup the joints acrroding to agent configuration.
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 473d6fc4..ef54d172 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -178,6 +178,8 @@ def __init__(self,
                      'coke_can', 'table', 'car_wheel', 'plastic_cup', 'beer'
                  ],
                  goal_conditioned=False,
+                 use_aux_achieved=False,
+                 xy_only_aux=False,
                  multi_dim_reward=False,
                  end_on_hitting_distraction=False,
                  end_episode_after_success=False,
@@ -214,6 +216,10 @@ def __init__(self,
             goal_name (string): name of the goal in the world
             distraction_list (list of string): a list of model. the model shoud be in gazebo database
             goal_conditioned (bool): if True, each step has -1 reward, unless at goal state, which gives 0.
+            use_aux_achieved (bool): if True, pull out speed, pose dimensions into a separate
+                field: aux_achieved.  Only valid when goal_conditioned is True.
+            xy_only_aux (bool): exclude irrelevant dimensions (z-axis movements) from
+                aux_achieved field.
             multi_dim_reward (bool): if True, separate goal reward and distraction penalty into two dimensions.
             end_episode_after_success (bool): if True, the episode will end once the goal is reached. A True value of this
                 flag will overwrite the effects of flags ``switch_goal_within_episode`` and ``move_goal_during_episode``.
@@ -269,6 +275,8 @@ def __init__(self,
             env=env, max_steps=max_steps, reward_weight=reward_weight)
         self._goal_name = goal_name
         self._goal_conditioned = goal_conditioned
+        self._use_aux_achieved = use_aux_achieved
+        self._xy_only_aux = xy_only_aux
         self._multi_dim_reward = multi_dim_reward
         self.end_on_hitting_distraction = end_on_hitting_distraction
         self._end_episode_after_success = end_episode_after_success
@@ -686,6 +694,26 @@ def set_goal_name(self, goal_name):
         logging.debug('Setting Goal to %s', goal_name)
         self._goal_name = goal_name
 
+    def generate_goal_conditioned_obs(self, agent):
+        flat_obs = self.task_specific_observation(agent)
+        obs = OrderedDict()
+        obs['observation'] = flat_obs
+        obs['achieved_goal'] = flat_obs[6:8]
+        obs['desired_goal'] = flat_obs[12:14]
+        if self._use_aux_achieved:
+            obs['observation'] = flat_obs[14:]
+            obs['aux_achieved'] = np.concatenate(
+                (flat_obs[:6], flat_obs[8:12]), axis=0)
+            if self._xy_only_aux:
+                # 2: z-speed, 3, 4: angular velocity, 5: yaw-vel, 6, 7: x, y, 8: z, 9, 10, 11: pose
+                obs['observation'] = np.concatenate(
+                    (flat_obs[2:5], flat_obs[8:11], flat_obs[14:]), axis=0)
+                obs['aux_achieved'] = np.concatenate(
+                    (flat_obs[0:2], np.expand_dims(flat_obs[5], 0),
+                     np.expand_dims(flat_obs[11], 0)),
+                    axis=0)
+        return obs
+
     def task_specific_observation(self, agent):
         """
         Args:

From dba09315cb651c9feeadc489cfea71b6f1b70a90 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Tue, 15 Sep 2020 16:01:51 -0700
Subject: [PATCH 23/53] use more readable states to generate goal conditioned
 output

---
 python/social_bot/tasks.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index ef54d172..b79233fa 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -697,20 +697,26 @@ def set_goal_name(self, goal_name):
     def generate_goal_conditioned_obs(self, agent):
         flat_obs = self.task_specific_observation(agent)
         obs = OrderedDict()
+        agent_pose = np.array(agent.get_pose()).flatten()
+        agent_vel = np.array(agent.get_velocities()).flatten()
+        goal = self._world.get_model(self._goal_name)
+        goal_pose = np.array(goal.get_pose()[0]).flatten()
         obs['observation'] = flat_obs
-        obs['achieved_goal'] = flat_obs[6:8]
-        obs['desired_goal'] = flat_obs[12:14]
+        obs['achieved_goal'] = agent_pose[0:2]
+        obs['desired_goal'] = goal_pose[0:2]
         if self._use_aux_achieved:
+            # distraction objects' x, y coordinates
             obs['observation'] = flat_obs[14:]
-            obs['aux_achieved'] = np.concatenate(
-                (flat_obs[:6], flat_obs[8:12]), axis=0)
+            obs['aux_achieved'] = np.concatenate((agent_vel, agent_pose[2:]),
+                                                 axis=0)
             if self._xy_only_aux:
-                # 2: z-speed, 3, 4: angular velocity, 5: yaw-vel, 6, 7: x, y, 8: z, 9, 10, 11: pose
+                # agent speed: 2: z-speed; 3, 4: angular velocities; 5: yaw-vel,
+                # agent pose: 2: z; 3, 4, 5: roll pitch yaw.
                 obs['observation'] = np.concatenate(
-                    (flat_obs[2:5], flat_obs[8:11], flat_obs[14:]), axis=0)
+                    (agent_vel[2:5], agent_pose[2:5], flat_obs[14:]), axis=0)
                 obs['aux_achieved'] = np.concatenate(
-                    (flat_obs[0:2], np.expand_dims(flat_obs[5], 0),
-                     np.expand_dims(flat_obs[11], 0)),
+                    (agent_vel[0:2], np.expand_dims(agent_vel[5], 0),
+                     np.expand_dims(agent_pose[5], 0)),
                     axis=0)
         return obs
 

From b7d0ab4e9cdea8c3902775193d1e285d33bdfb23 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Wed, 16 Sep 2020 12:59:05 -0700
Subject: [PATCH 24/53] address comments from JT.

---
 python/social_bot/gazebo_agent.py |  2 +-
 python/social_bot/tasks.py        | 74 ++++++++++++++-----------------
 2 files changed, 34 insertions(+), 42 deletions(-)

diff --git a/python/social_bot/gazebo_agent.py b/python/social_bot/gazebo_agent.py
index 062da801..88cb8eca 100644
--- a/python/social_bot/gazebo_agent.py
+++ b/python/social_bot/gazebo_agent.py
@@ -253,6 +253,7 @@ def get_observation_space(self, teacher):
         """
         obs_sample = self.get_observation(teacher)
         if self._goal_conditioned:
+            self._assert_goal_conditioned(teacher)
             observation_space = self._construct_goal_condi_space(
                 obs_sample, teacher)
         elif self._with_language or self._image_with_internal_states:
@@ -334,7 +335,6 @@ def _assert_goal_conditioned(self, teacher):
         assert len(tasks) == 1 and isinstance(tasks[0], GoalTask), str(tasks)
 
     def _construct_goal_condi_space(self, obs_sample, teacher):
-        self._assert_goal_conditioned(teacher)
         d = OrderedDict()
         for k, v in obs_sample.items():
             d[k] = gym.spaces.Box(
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index b79233fa..4acf9e57 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -745,49 +745,41 @@ def task_specific_observation(self, agent):
                 pose = np.concatenate((pose, obj_pos), axis=0)
 
         agent_pose = np.array(agent.get_pose()).flatten()
-        if self._use_full_states or self._use_egocentric_states:
-            yaw = agent_pose[5]
-            # adds egocentric velocity input
-            vx, vy, vz, a1, a2, a3 = np.array(agent.get_velocities()).flatten()
+        yaw = agent_pose[5]
+        # adds egocentric velocity input
+        vx, vy, vz, a1, a2, a3 = np.array(agent.get_velocities()).flatten()
+        if self._use_egocentric_states:
+            rvx, rvy = agent.get_egocentric_cord_2d(vx, vy, yaw)
+        else:
+            rvx, rvy = vx, vy
+        obs = [rvx, rvy, vz, a1, a2, a3] + list(agent_pose)
+        # adds objects' (goal's as well as distractions') egocentric
+        # coordinates to observation
+        while len(pose) > 1:
+            x = pose[0]
+            y = pose[1]
             if self._use_egocentric_states:
-                rvx, rvy = agent.get_egocentric_cord_2d(vx, vy, yaw)
+                x = pose[0] - agent_pose[0]
+                y = pose[1] - agent_pose[1]
+                rotated_x, rotated_y = agent.get_egocentric_cord_2d(x, y, yaw)
             else:
-                rvx, rvy = vx, vy
-            obs = [rvx, rvy, vz, a1, a2, a3] + list(agent_pose)
-            # adds objects' (goal's as well as distractions') egocentric
-            # coordinates to observation
-            while len(pose) > 1:
-                x = pose[0]
-                y = pose[1]
-                if self._use_egocentric_states:
-                    x = pose[0] - agent_pose[0]
-                    y = pose[1] - agent_pose[1]
-                    rotated_x, rotated_y = agent.get_egocentric_cord_2d(
-                        x, y, yaw)
-                else:
-                    rotated_x, rotated_y = x, y
-                if (self._use_egocentric_states
-                        and self._egocentric_perception_range > 0):
-                    dist = math.sqrt(rotated_x * rotated_x +
-                                     rotated_y * rotated_y)
-                    rotated_x /= dist
-                    rotated_y /= dist
-                    magnitude = 1. / dist
-                    if rotated_x < np.cos(
-                            self._egocentric_perception_range / 180. * np.pi):
-                        rotated_x = 0.
-                        rotated_y = 0.
-                        magnitude = 0.
-                    obs.extend([rotated_x, rotated_y, magnitude])
-                else:
-                    obs.extend([rotated_x, rotated_y])
-                pose = pose[3:]
-            obs = np.array(obs)
-        else:
-            agent_vel = np.array(agent.get_velocities()).flatten()
-            joints_states = agent.get_internal_states()
-            obs = np.concatenate((pose, agent_pose, agent_vel, joints_states),
-                                 axis=0)
+                rotated_x, rotated_y = x, y
+            if (self._use_egocentric_states
+                    and self._egocentric_perception_range > 0):
+                dist = math.sqrt(rotated_x * rotated_x + rotated_y * rotated_y)
+                rotated_x /= dist
+                rotated_y /= dist
+                magnitude = 1. / dist
+                if rotated_x < np.cos(
+                        self._egocentric_perception_range / 180. * np.pi):
+                    rotated_x = 0.
+                    rotated_y = 0.
+                    magnitude = 0.
+                obs.extend([rotated_x, rotated_y, magnitude])
+            else:
+                obs.extend([rotated_x, rotated_y])
+            pose = pose[3:]
+        obs = np.array(obs)
 
         return obs
 

From da45b969e51521297ad05587eb20414b1c1a9f21 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Wed, 16 Sep 2020 23:52:57 -0700
Subject: [PATCH 25/53] remove unused var use_full_states

---
 python/social_bot/tasks.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 4acf9e57..4b07f22f 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -206,7 +206,6 @@ def __init__(self,
                  move_goal_during_episode=True,
                  success_with_angle_requirement=True,
                  additional_observation_list=[],
-                 use_full_states=False,
                  use_egocentric_states=False,
                  egocentric_perception_range=0):
         """
@@ -260,7 +259,6 @@ def __init__(self,
             move_goal_during_episode (bool): if True, the goal will be moved during episode, when it has been achieved
             success_with_angle_requirement: if True then calculate the reward considering the angular requirement
             additional_observation_list: a list of additonal objects to be added
-            use_full_states (bool): For non-image observation, use full states of the world
             use_egocentric_states (bool): For the non-image observation case, use the states transformed to
                 egocentric coordinate, e.g., agent's egocentric distance and direction to goal
             egocentric_perception_range (float): the max range in degree to limit the agent's observation.
@@ -317,7 +315,6 @@ def __init__(self,
                       self._max_play_ground_size)))
         self._pos_list.remove((0, 0))
         self._polar_coord = polar_coord
-        self._use_full_states = use_full_states
         self._use_egocentric_states = use_egocentric_states
         self._egocentric_perception_range = egocentric_perception_range
         if self.should_use_curriculum_training():
@@ -339,13 +336,13 @@ def __init__(self,
             self._random_range = random_range
         self._goal_dist = 0.
         obs_format = "image"
-        if use_full_states or use_egocentric_states:
-            obs_format = "full_state"
         obs_relative = "ego"
-        if use_full_states and not use_egocentric_states:
+        if use_egocentric_states:
+            obs_format = "full_state"
+        else:
             obs_relative = "absolute"
         logging.info("Observations: {}, {}.".format(obs_format, obs_relative))
-        if use_full_states and not use_egocentric_states:
+        if not use_egocentric_states:
             logging.info(
                 "Dims: 0-5: agent's velocity and angular " +
                 "velocity, 6-11: agent's position and pose, 12-13: goal x, y" +

From 19ce0c7d1bdf9797427fdf58d2fe33ad892ad221 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Mon, 5 Oct 2020 10:51:01 -0700
Subject: [PATCH 26/53] random agent start position in GoalTask.  Fix
 initial_dist and prev_dist inaccurate due to distraction objects also counted
 in.

---
 .../social_bot/models/target_ball/model.sdf   |  2 +-
 python/social_bot/tasks.py                    | 22 ++++++++++++++-----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/python/social_bot/models/target_ball/model.sdf b/python/social_bot/models/target_ball/model.sdf
index 7e5a742a..0e80a542 100644
--- a/python/social_bot/models/target_ball/model.sdf
+++ b/python/social_bot/models/target_ball/model.sdf
@@ -8,7 +8,7 @@
       <visual name="visual">
         <geometry>
           <sphere>
-            <radius>0.03</radius>
+            <radius>0.1</radius>
           </sphere>
         </geometry>
         <material>
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 4b07f22f..e2eb01ad 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -188,6 +188,7 @@ def __init__(self,
                  fail_distance_thresh=2.0,
                  distraction_penalty_distance_thresh=0,
                  distraction_penalty=0.5,
+                 random_agent_position=False,
                  random_agent_orientation=False,
                  sparse_reward=True,
                  random_range=5.0,
@@ -233,6 +234,7 @@ def __init__(self,
                 facing it is considered hitting a distraction)
             distraction_penalty (float): positive float of how much to penalize getting too close to
                 distraction objects
+            random_agent_position (bool): whether randomize the position of the agent at beginning of episode.
             random_agent_orientation (bool): whether randomize the orientation (yaw) of the agent at the beginning of an
                 episode.
             sparse_reward (bool): if true, the reward is -1/0/1, otherwise the 0 case will be replaced
@@ -286,6 +288,7 @@ def __init__(self,
             assert distraction_penalty_distance_thresh < success_distance_thresh
         self._distraction_penalty = distraction_penalty
         self._sparse_reward = sparse_reward
+        self._random_agent_position = random_agent_position
         self._random_agent_orientation = random_agent_orientation
         self._use_curriculum_training = use_curriculum_training
         self._curriculum_distractions = curriculum_distractions
@@ -431,10 +434,16 @@ def run(self):
         """ Start a teaching episode for this task. """
         agent_sentence = yield
         self._agent.reset()
-        if self._random_agent_orientation:
+        if self._random_agent_orientation or self._random_agent_position:
             loc, agent_dir = self._agent.get_pose()
-            self._agent.set_pose((loc, (agent_dir[0], agent_dir[1],
-                                        2 * math.pi * random.random())))
+            if self._random_agent_position:
+                loc = (self._max_play_ground_size * (1 - 2 * random.random()),
+                       self._max_play_ground_size * (1 - 2 * random.random()),
+                       loc[2])
+            if self._random_agent_orientation:
+                agent_dir = (agent_dir[0], agent_dir[1],
+                             2 * math.pi * random.random())
+            self._agent.set_pose((loc, agent_dir))
         a_loc, a_dir = self._get_agent_loc()
         self._random_move_objects()
         self.pick_goal()
@@ -639,7 +648,8 @@ def _move_obj(self,
                 loc = np.asarray((random.random() * range - range / 2,
                                   random.random() * range - range / 2, 0))
 
-            self._initial_dist = np.linalg.norm(loc - agent_loc)
+            if is_goal:
+                self._initial_dist = np.linalg.norm(loc - agent_loc)
             satisfied = True
             if (abs(loc[0]) > self._max_play_ground_size or abs(loc[1]) >
                     self._max_play_ground_size):  # not within walls
@@ -658,7 +668,8 @@ def _move_obj(self,
                             str(agent_loc), str(range),
                             str(self._max_play_ground_size)))
                 break
-        self._prev_dist = self._initial_dist
+        if is_goal:
+            self._prev_dist = self._initial_dist
         obj.reset()
         obj.set_pose((loc, (0, 0, 0)))
         return loc, dist
@@ -704,6 +715,7 @@ def generate_goal_conditioned_obs(self, agent):
         if self._use_aux_achieved:
             # distraction objects' x, y coordinates
             obs['observation'] = flat_obs[14:]
+            # 0, 1, 2: vel; 3, 4, 5: angular vel; 6: z position; 7, 8, 9: roll pitch yaw
             obs['aux_achieved'] = np.concatenate((agent_vel, agent_pose[2:]),
                                                  axis=0)
             if self._xy_only_aux:

From 47b2ee9765033763acb7150bdab0125bd2da83c2 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Fri, 30 Oct 2020 22:14:40 -0700
Subject: [PATCH 27/53] more debug printing of agent states at episode end

---
 python/social_bot/tasks.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index e2eb01ad..d1d54284 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -428,7 +428,7 @@ def _get_goal_dist(self, goal):
         dir = np.array([math.cos(agent_dir[2]), math.sin(agent_dir[2])])
         goal_dir = (goal_loc[0:2] - loc[0:2]) / dist
         dot = sum(dir * goal_dir)
-        return dist, dot, loc
+        return dist, dot, loc, agent_dir
 
     def run(self):
         """ Start a teaching episode for this task. """
@@ -454,7 +454,7 @@ def run(self):
         rewards = None  # reward array in multi_dim_reward case
         while steps_since_last_reward < self._max_steps:
             steps_since_last_reward += 1
-            dist, dot, loc = self._get_goal_dist(goal)
+            dist, dot, loc, agent_dir = self._get_goal_dist(goal)
             distraction_penalty, prev_min_dist_to_distraction = (
                 self._get_distraction_penalty(loc, dot,
                                               prev_min_dist_to_distraction))
@@ -524,7 +524,7 @@ def run(self):
                     done=done,
                     rewards=rewards)
         reward = -1.0
-        dist, dot, loc = self._get_goal_dist(goal)
+        dist, dot, loc, agent_dir = self._get_goal_dist(goal)
         distraction_penalty, prev_min_dist_to_distraction = (
             self._get_distraction_penalty(loc, dot,
                                           prev_min_dist_to_distraction))
@@ -546,7 +546,16 @@ def run(self):
             self._push_reward_queue(0)
             logging.debug("took more than {} steps".format(
                 str(self._max_steps)))
-        logging.debug("yielding reward: {}".format(str(reward)))
+        agent_vel = np.array(self._agent.get_velocities()).flatten()
+
+        def _str(arr):
+            res = ["["]
+            res.extend(["{:.2f}".format(e) for e in arr])
+            res.append("]")
+            return " ".join(res)
+
+        logging.debug("yielding reward: %s at\nloc %s pose %s vel %s",
+                      str(reward), _str(loc), _str(agent_dir), _str(agent_vel))
         if self.should_use_curriculum_training():
             logging.debug("reward queue len: {}, sum: {}".format(
                 str(len(self._q)), str(sum(self._q))))

From 200368d0049c86d96d62af0e297dc36fbdc03d9f Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Fri, 6 Nov 2020 14:19:16 -0800
Subject: [PATCH 28/53] add short walls to enclose playground

---
 python/social_bot/worlds/play_ground.world | 31 ++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/python/social_bot/worlds/play_ground.world b/python/social_bot/worlds/play_ground.world
index 1c80a0a8..684849e3 100644
--- a/python/social_bot/worlds/play_ground.world
+++ b/python/social_bot/worlds/play_ground.world
@@ -40,6 +40,13 @@
       </include>
       <pose frame=''>-2 5.7 0 0 -0 0</pose>
     </model>
+    <model name='grey_wall_left2'>
+      <static>1</static>
+      <include>
+        <uri>model://grey_wall</uri>
+      </include>
+      <pose frame=''>2 5.7 0 0 -0 0</pose>
+    </model>
     <model name='grey_wall_right'>
       <static>1</static>
       <include>
@@ -47,6 +54,13 @@
       </include>
       <pose frame=''>-2 -5.7 0 0 -0 0</pose>
     </model>
+    <model name='grey_wall_right2'>
+      <static>1</static>
+      <include>
+        <uri>model://grey_wall</uri>
+      </include>
+      <pose frame=''>2 -5.7 0 0 -0 0</pose>
+    </model>
     <model name='grey_wall_behind0'>
       <static>1</static>
       <include>
@@ -61,5 +75,22 @@
       </include>
       <pose frame=''>-5.7 -2 0 0 -0 1.5708</pose>
     </model>
+    <model name="east_wall">
+      <static>1</static>
+      <pose>5.7 0 0.1 0 0 0</pose>
+      <link name ="link">
+        <pose>0 0 0 0 0 0</pose>
+        <collision name ="collision">
+          <geometry>
+            <box><size>0.2 11.4 0.2</size></box>
+          </geometry>
+        </collision>
+        <visual name ='visual'>
+          <geometry>
+            <box><size>0.2 11.4 0.2</size></box>
+          </geometry>
+        </visual>
+      </link>
+    </model>
   </world>
 </sdf>

From 593135abead677af29956f7b495fb516642691ac Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Wed, 11 Nov 2020 14:50:21 -0800
Subject: [PATCH 29/53] allow chaining goaltask together with probability, also
 avoids resetting goals too close to distractions.

---
 python/social_bot/tasks.py | 59 +++++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 16 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index d1d54284..dc872770 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -184,6 +184,7 @@ def __init__(self,
                  end_on_hitting_distraction=False,
                  end_episode_after_success=False,
                  reset_time_limit_on_success=True,
+                 chain_task_rate=0,
                  success_distance_thresh=0.5,
                  fail_distance_thresh=2.0,
                  distraction_penalty_distance_thresh=0,
@@ -226,6 +227,10 @@ def __init__(self,
             end_on_hitting_distraction (bool): whether to end episode on hitting distraction
             reset_time_limit_on_success (bool): if not ending after success, if hit success before time limit,
                 reset clock to 0.
+            chain_task_rate (float): if positive, with this much probability, the current task will be chained together
+                with the next task: after one tasks finishes, episode doesn't end, but keeps onto the second goal/task.
+                If the first task isn't achieved within max_steps, episode ends.  When the first task is achieved,
+                step_type is MID and no goal reward is given.  Reward is given only after achieving all chained tasks.
             success_distance_thresh (float): the goal is reached if it's within this distance to the agent
             fail_distance_thresh (float): if the agent moves away from the goal more than this distance,
                 it's considered a failure and is given reward -1
@@ -281,6 +286,7 @@ def __init__(self,
         self.end_on_hitting_distraction = end_on_hitting_distraction
         self._end_episode_after_success = end_episode_after_success
         self._reset_time_limit_on_success = reset_time_limit_on_success
+        self._chain_task_rate = chain_task_rate
         self._success_distance_thresh = success_distance_thresh
         self._fail_distance_thresh = fail_distance_thresh
         self._distraction_penalty_distance_thresh = distraction_penalty_distance_thresh
@@ -444,11 +450,24 @@ def run(self):
                 agent_dir = (agent_dir[0], agent_dir[1],
                              2 * math.pi * random.random())
             self._agent.set_pose((loc, agent_dir))
-        a_loc, a_dir = self._get_agent_loc()
         self._random_move_objects()
         self.pick_goal()
         goal = self._world.get_model(self._goal_name)
-        self._move_goal(goal, a_loc, a_dir)
+        done = False
+        gen = self._run_one_goal(goal)
+        while not done:
+            action = next(gen)
+            if (action.done and self._chain_task_rate > 0
+                    and random.random() < self._chain_task_rate):
+                action.done = False
+                gen = self._run_one_goal(goal, move_distractions=False)
+            done = action.done
+            yield action
+
+    def _run_one_goal(self, goal, move_distractions=True):
+        """Generator function of task feedback."""
+        a_loc, a_dir = self._get_agent_loc()
+        self._move_goal(goal, a_loc, a_dir, move_distractions)
         steps_since_last_reward = 0
         prev_min_dist_to_distraction = 100
         rewards = None  # reward array in multi_dim_reward case
@@ -596,11 +615,18 @@ def _get_distraction_penalty(self, agent_loc, dot,
             prev_min_dist_to_distraction = curr_min_dist
         return distraction_penalty, prev_min_dist_to_distraction
 
-    def _move_goal(self, goal, agent_loc, agent_dir):
+    def _move_goal(self, goal, agent_loc, agent_dir, move_distractions=True):
         """
         Move goal as well as all distraction objects to a random location.
         """
         avoid_locations = [agent_loc]
+        if not move_distractions:
+            for item in self._distraction_list:
+                if item is not self._goal_name:
+                    obj = self._world.get_model(item)
+                    if obj:
+                        distraction_loc = obj.get_pose()[0]
+                        avoid_locations.append(distraction_loc)
         loc, dist = self._move_obj(
             obj=goal,
             agent_loc=agent_loc,
@@ -610,19 +636,20 @@ def _move_goal(self, goal, agent_loc, agent_dir):
         self._goal_dist += dist
         avoid_locations.append(loc)
         distractions = OrderedDict()
-        for item in self._distraction_list:
-            if item is not self._goal_name:
-                distractions[item] = 1
-        if len(distractions) and self._curriculum_distractions:
-            for item, _ in distractions.items():
-                distraction = self._world.get_agent(item)
-                loc, _ = self._move_obj(
-                    obj=distraction,
-                    agent_loc=agent_loc,
-                    agent_dir=agent_dir,
-                    is_goal=False,
-                    avoid_locations=avoid_locations)
-                avoid_locations.append(loc)
+        if move_distractions:
+            for item in self._distraction_list:
+                if item is not self._goal_name:
+                    distractions[item] = 1
+            if len(distractions) and self._curriculum_distractions:
+                for item, _ in distractions.items():
+                    distraction = self._world.get_agent(item)
+                    loc, _ = self._move_obj(
+                        obj=distraction,
+                        agent_loc=agent_loc,
+                        agent_dir=agent_dir,
+                        is_goal=False,
+                        avoid_locations=avoid_locations)
+                    avoid_locations.append(loc)
 
     def _move_obj(self,
                   obj,

From 9734ffb7e08ebb564f1ae57a5998d273f582631d Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Tue, 12 Jan 2021 21:58:18 -0800
Subject: [PATCH 30/53] allow setting min_distance of goal to agent's location,
 so now goal range is uniformly random within random_range (max) and the min.

---
 python/social_bot/tasks.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index dc872770..0eb7bfd1 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -193,6 +193,7 @@ def __init__(self,
                  random_agent_orientation=False,
                  sparse_reward=True,
                  random_range=5.0,
+                 min_distance=0,
                  polar_coord=True,
                  random_goal=False,
                  use_curriculum_training=False,
@@ -245,6 +246,8 @@ def __init__(self,
             sparse_reward (bool): if true, the reward is -1/0/1, otherwise the 0 case will be replaced
                 with normalized distance the agent get closer to goal.
             random_range (float): the goal's random position range
+            min_distance (float): the goal must be this minimum distance away from avoided locations.  If this is smaller
+                than the success_distance_thresh, then success_distance_thresh is used instead.
             polar_coord (bool): use cartesian coordinates in random_range, otherwise, use polar coord.
             random_goal (bool): if True, teacher will randomly select goal from the object list each episode
             use_curriculum_training (bool): when true, use curriculum in goal task training
@@ -343,6 +346,9 @@ def __init__(self,
                 self._reward_thresh_to_increase_range)
         else:
             self._random_range = random_range
+        if min_distance < self._success_distance_thresh:
+            min_distance = self._success_distance_thresh
+        self._min_distance = min_distance
         self._goal_dist = 0.
         obs_format = "image"
         obs_relative = "ego"
@@ -457,7 +463,7 @@ def run(self):
         gen = self._run_one_goal(goal)
         while not done:
             action = next(gen)
-            if (action.done and self._chain_task_rate > 0
+            if (action.done and action.success and self._chain_task_rate > 0
                     and random.random() < self._chain_task_rate):
                 action.done = False
                 gen = self._run_one_goal(goal, move_distractions=False)
@@ -692,7 +698,7 @@ def _move_obj(self,
                 satisfied = False
             for avoid_loc in avoid_locations:
                 dist = np.linalg.norm(loc - avoid_loc)
-                if dist < self._success_distance_thresh:
+                if dist < self._min_distance:
                     satisfied = False
                     break
             if satisfied or attempts > 10000:

From 5c98ed08a885f091489644bc1f2a24e252fc7e36 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 21 Jan 2021 14:21:31 -0800
Subject: [PATCH 31/53] allow using speed_goal (besides position goal) in
 GoalTask.

---
 python/social_bot/tasks.py | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 0eb7bfd1..e3297bf7 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -178,6 +178,7 @@ def __init__(self,
                      'coke_can', 'table', 'car_wheel', 'plastic_cup', 'beer'
                  ],
                  goal_conditioned=False,
+                 speed_goal=False,
                  use_aux_achieved=False,
                  xy_only_aux=False,
                  multi_dim_reward=False,
@@ -218,6 +219,7 @@ def __init__(self,
             goal_name (string): name of the goal in the world
             distraction_list (list of string): a list of model. the model shoud be in gazebo database
             goal_conditioned (bool): if True, each step has -1 reward, unless at goal state, which gives 0.
+            speed_goal (bool): if True, use speed pose etc. together with position as part of goal.
             use_aux_achieved (bool): if True, pull out speed, pose dimensions into a separate
                 field: aux_achieved.  Only valid when goal_conditioned is True.
             xy_only_aux (bool): exclude irrelevant dimensions (z-axis movements) from
@@ -283,6 +285,7 @@ def __init__(self,
             env=env, max_steps=max_steps, reward_weight=reward_weight)
         self._goal_name = goal_name
         self._goal_conditioned = goal_conditioned
+        self._speed_goal = speed_goal
         self._use_aux_achieved = use_aux_achieved
         self._xy_only_aux = xy_only_aux
         self._multi_dim_reward = multi_dim_reward
@@ -639,6 +642,15 @@ def _move_goal(self, goal, agent_loc, agent_dir, move_distractions=True):
             agent_dir=agent_dir,
             is_goal=True,
             avoid_locations=avoid_locations)
+        if self._speed_goal:
+            MAX_SPEED = 4 * 0.8  # -2 to 2
+            xspeed = (0.5 - random.random()) * MAX_SPEED
+            yspeed = (0.5 - random.random()) * MAX_SPEED
+            yawspeed = (0.5 - random.random()) * MAX_SPEED
+            yaw = np.arctan2(yspeed, xspeed)
+            # 0, 1, 2: vel; 3, 4, 5: angular vel; 6: z position; 7, 8, 9: roll pitch yaw
+            self._aux_desired = np.array(
+                [xspeed, yspeed, 0, 0, 0, yawspeed, 0, 0, 0, yaw])
         self._goal_dist += dist
         avoid_locations.append(loc)
         distractions = OrderedDict()
@@ -760,15 +772,21 @@ def generate_goal_conditioned_obs(self, agent):
             # 0, 1, 2: vel; 3, 4, 5: angular vel; 6: z position; 7, 8, 9: roll pitch yaw
             obs['aux_achieved'] = np.concatenate((agent_vel, agent_pose[2:]),
                                                  axis=0)
-            if self._xy_only_aux:
-                # agent speed: 2: z-speed; 3, 4: angular velocities; 5: yaw-vel,
-                # agent pose: 2: z; 3, 4, 5: roll pitch yaw.
-                obs['observation'] = np.concatenate(
-                    (agent_vel[2:5], agent_pose[2:5], flat_obs[14:]), axis=0)
-                obs['aux_achieved'] = np.concatenate(
-                    (agent_vel[0:2], np.expand_dims(agent_vel[5], 0),
-                     np.expand_dims(agent_pose[5], 0)),
-                    axis=0)
+            if self._speed_goal:
+                obs['achieved_goal'] = np.concatenate(
+                    (obs['achieved_goal'], obs['aux_achieved']), axis=0)
+                del obs['aux_achieved']
+                obs['desired_goal'] = np.concatenate(
+                    (obs['desired_goal'], self._aux_desired), axis=0)
+            # if self._xy_only_aux:
+            #     # agent speed: 2: z-speed; 3, 4: angular velocities; 5: yaw-vel,
+            #     # agent pose: 2: z; 3, 4, 5: roll pitch yaw.
+            #     obs['observation'] = np.concatenate(
+            #         (agent_vel[2:5], agent_pose[2:5], flat_obs[14:]), axis=0)
+            #     obs['aux_achieved'] = np.concatenate(
+            #         (agent_vel[0:2], np.expand_dims(agent_vel[5], 0),
+            #          np.expand_dims(agent_pose[5], 0)),
+            #         axis=0)
         return obs
 
     def task_specific_observation(self, agent):

From e454aee0a3b8b62b7716e046cf9b5aa2d539073e Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Tue, 26 Jan 2021 10:06:55 -0800
Subject: [PATCH 32/53] allow setting speed limit in speed goal task, refactor

---
 python/social_bot/tasks.py | 43 ++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index e3297bf7..94103849 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -179,6 +179,7 @@ def __init__(self,
                  ],
                  goal_conditioned=False,
                  speed_goal=False,
+                 speed_goal_limit=1.6,
                  use_aux_achieved=False,
                  xy_only_aux=False,
                  multi_dim_reward=False,
@@ -220,6 +221,7 @@ def __init__(self,
             distraction_list (list of string): a list of model. the model shoud be in gazebo database
             goal_conditioned (bool): if True, each step has -1 reward, unless at goal state, which gives 0.
             speed_goal (bool): if True, use speed pose etc. together with position as part of goal.
+            speed_goal_limit (float): randomly sample speed goal in the range: -limit to +limit.
             use_aux_achieved (bool): if True, pull out speed, pose dimensions into a separate
                 field: aux_achieved.  Only valid when goal_conditioned is True.
             xy_only_aux (bool): exclude irrelevant dimensions (z-axis movements) from
@@ -286,6 +288,7 @@ def __init__(self,
         self._goal_name = goal_name
         self._goal_conditioned = goal_conditioned
         self._speed_goal = speed_goal
+        self._speed_goal_limit = speed_goal_limit
         self._use_aux_achieved = use_aux_achieved
         self._xy_only_aux = xy_only_aux
         self._multi_dim_reward = multi_dim_reward
@@ -434,11 +437,26 @@ def _prepare_teacher_action(self,
     def _within_angle(self, dot):
         return (not self._success_with_angle_requirement) or dot > 0.707
 
+    def _get_agent_aux_dims(self, agent_pose=None, agent_vel=None):
+        if agent_pose is None:
+            agent_pose = np.array(self._agent.get_pose()).flatten()
+        if agent_vel is None:
+            agent_vel = np.array(self._agent.get_velocities()).flatten()
+        # 0, 1, 2: vel; 3, 4, 5: angular vel; 6: z position; 7, 8, 9: roll pitch yaw
+        return np.concatenate((agent_vel, agent_pose[2:]), axis=0)
+
     def _get_goal_dist(self, goal):
         loc, agent_dir = self._get_agent_loc()
         goal_loc, _ = goal.get_pose()
         goal_loc = np.array(goal_loc)
-        dist = np.linalg.norm(loc - goal_loc)
+        _goal_loc = goal_loc.copy()
+        _loc = np.array(loc).copy()
+        if self._speed_goal:
+            _goal_loc = np.concatenate((_goal_loc, self._aux_desired), axis=0)
+            aux = self._get_agent_aux_dims()
+            _loc = np.concatenate((_loc, aux), axis=0)
+
+        dist = np.linalg.norm(_loc - _goal_loc)
         # dir from get_pose is (roll, pitch, yaw)
         dir = np.array([math.cos(agent_dir[2]), math.sin(agent_dir[2])])
         goal_dir = (goal_loc[0:2] - loc[0:2]) / dist
@@ -643,11 +661,12 @@ def _move_goal(self, goal, agent_loc, agent_dir, move_distractions=True):
             is_goal=True,
             avoid_locations=avoid_locations)
         if self._speed_goal:
-            MAX_SPEED = 4 * 0.8  # -2 to 2
+            MAX_SPEED = self._speed_goal_limit * 2
             xspeed = (0.5 - random.random()) * MAX_SPEED
             yspeed = (0.5 - random.random()) * MAX_SPEED
             yawspeed = (0.5 - random.random()) * MAX_SPEED
-            yaw = np.arctan2(yspeed, xspeed)
+            yaw = np.arctan2(
+                yspeed, xspeed)  # numpy.arctan2 is defined as arctan(x0/x1)
             # 0, 1, 2: vel; 3, 4, 5: angular vel; 6: z position; 7, 8, 9: roll pitch yaw
             self._aux_desired = np.array(
                 [xspeed, yspeed, 0, 0, 0, yawspeed, 0, 0, 0, yaw])
@@ -766,18 +785,16 @@ def generate_goal_conditioned_obs(self, agent):
         obs['observation'] = flat_obs
         obs['achieved_goal'] = agent_pose[0:2]
         obs['desired_goal'] = goal_pose[0:2]
-        if self._use_aux_achieved:
+        aux = self._get_agent_aux_dims(agent_pose, agent_vel)
+        if self._speed_goal:
+            obs['achieved_goal'] = np.concatenate((obs['achieved_goal'], aux),
+                                                  axis=0)
+            obs['desired_goal'] = np.concatenate(
+                (obs['desired_goal'], self._aux_desired), axis=0)
+        elif self._use_aux_achieved:
             # distraction objects' x, y coordinates
             obs['observation'] = flat_obs[14:]
-            # 0, 1, 2: vel; 3, 4, 5: angular vel; 6: z position; 7, 8, 9: roll pitch yaw
-            obs['aux_achieved'] = np.concatenate((agent_vel, agent_pose[2:]),
-                                                 axis=0)
-            if self._speed_goal:
-                obs['achieved_goal'] = np.concatenate(
-                    (obs['achieved_goal'], obs['aux_achieved']), axis=0)
-                del obs['aux_achieved']
-                obs['desired_goal'] = np.concatenate(
-                    (obs['desired_goal'], self._aux_desired), axis=0)
+            obs['aux_achieved'] = aux
             # if self._xy_only_aux:
             #     # agent speed: 2: z-speed; 3, 4: angular velocities; 5: yaw-vel,
             #     # agent pose: 2: z; 3, 4, 5: roll pitch yaw.

From 0bdca8cc6246d733ce38b2af9fd452cd16732c9f Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Wed, 7 Apr 2021 15:01:06 -0700
Subject: [PATCH 33/53] more debug info for goal task to output states when
 reward achieved.

---
 python/social_bot/tasks.py | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 94103849..9a3ae7cc 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -519,6 +519,8 @@ def _run_one_goal(self, goal, move_distractions=True):
                     if self._multi_dim_reward:
                         rewards = np.array([1, -distraction_penalty])
                 logging.debug("yielding reward: " + str(reward))
+                logging.debug("at location: %s, aux: %s", a_loc.astype('|S5'),
+                              self._get_agent_aux_dims().astype('|S5'))
                 done = self._end_episode_after_success
                 agent_sentence = yield self._prepare_teacher_action(
                     reward=reward,
@@ -659,7 +661,8 @@ def _move_goal(self, goal, agent_loc, agent_dir, move_distractions=True):
             agent_loc=agent_loc,
             agent_dir=agent_dir,
             is_goal=True,
-            avoid_locations=avoid_locations)
+            avoid_locations=avoid_locations,
+            name="goal")
         if self._speed_goal:
             MAX_SPEED = self._speed_goal_limit * 2
             xspeed = (0.5 - random.random()) * MAX_SPEED
@@ -685,7 +688,8 @@ def _move_goal(self, goal, agent_loc, agent_dir, move_distractions=True):
                         agent_loc=agent_loc,
                         agent_dir=agent_dir,
                         is_goal=False,
-                        avoid_locations=avoid_locations)
+                        avoid_locations=avoid_locations,
+                        name=item)
                     avoid_locations.append(loc)
 
     def _move_obj(self,
@@ -693,7 +697,8 @@ def _move_obj(self,
                   agent_loc,
                   agent_dir,
                   is_goal=True,
-                  avoid_locations=[]):
+                  avoid_locations=[],
+                  name="Unspecified"):
         if (self.should_use_curriculum_training()
                 and self._percent_full_range_in_curriculum > 0
                 and random.random() < self._percent_full_range_in_curriculum):
@@ -704,6 +709,7 @@ def _move_obj(self,
             self._is_full_range_in_curriculum = False
         attempts = 0
         dist = range
+        _min_distance = self._min_distance
         while True:
             attempts += 1
             dist = random.random() * range
@@ -729,17 +735,26 @@ def _move_obj(self,
                 satisfied = False
             for avoid_loc in avoid_locations:
                 dist = np.linalg.norm(loc - avoid_loc)
-                if dist < self._min_distance:
+                if dist < _min_distance:
                     satisfied = False
                     break
-            if satisfied or attempts > 10000:
+            if satisfied or attempts % 10000 == 0 or attempts > 30000:
                 if not satisfied:
-                    logging.warning(
-                        "Took forever to find satisfying " +
-                        "object location. " +
-                        "agent_loc: {}, range: {}, max_size: {}.".format(
-                            str(agent_loc), str(range),
-                            str(self._max_play_ground_size)))
+                    if attempts <= 30000:
+                        _min_distance /= 2.
+                        logging.warning(
+                            "Took {} times to find satisfying {} "
+                            "location. reducing _min_dist to {}".format(
+                                str(attempts), name, str(_min_distance)))
+                        continue
+                    else:
+                        logging.warning(
+                            "Took forever to find satisfying " +
+                            "{} location. " +
+                            "agent_loc: {}, range: {}, _min_dist: {}, max_size"
+                            ": {}.".format(name, str(agent_loc), str(range),
+                                           str(_min_distance),
+                                           str(self._max_play_ground_size)))
                 break
         if is_goal:
             self._prev_dist = self._initial_dist
@@ -787,6 +802,7 @@ def generate_goal_conditioned_obs(self, agent):
         obs['desired_goal'] = goal_pose[0:2]
         aux = self._get_agent_aux_dims(agent_pose, agent_vel)
         if self._speed_goal:
+            obs['observation'] = flat_obs[14:]
             obs['achieved_goal'] = np.concatenate((obs['achieved_goal'], aux),
                                                   axis=0)
             obs['desired_goal'] = np.concatenate(

From 1b71a778094feea11c9ba524cea00bade37a75a0 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Mon, 17 May 2021 11:52:26 -0700
Subject: [PATCH 34/53] add PushReachTask

---
 python/social_bot/envs/play_ground.py         |   4 +-
 .../model.config                              |  18 ++
 .../wood_cube_30cm_without_offset/model.sdf   |  63 +++++++
 python/social_bot/tasks.py                    | 175 +++++++++++++++++-
 4 files changed, 250 insertions(+), 10 deletions(-)
 create mode 100644 python/social_bot/models/wood_cube_30cm_without_offset/model.config
 create mode 100644 python/social_bot/models/wood_cube_30cm_without_offset/model.sdf

diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index 68ef5f0b..2f63bd4b 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -36,7 +36,7 @@
 from social_bot.envs.gazebo_base import GazeboEnvBase
 from social_bot.teacher import TaskGroup
 from social_bot.teacher import TeacherAction
-from social_bot.tasks import GoalTask, ICubAuxiliaryTask, KickingBallTask, Reaching3D
+from social_bot.tasks import GoalTask, PushReachTask, ICubAuxiliaryTask, KickingBallTask, Reaching3D
 
 
 @gin.configurable
@@ -173,7 +173,7 @@ def __init__(self,
         self._teacher = teacher.Teacher(task_groups_exclusive=False)
         self._has_goal_task = False
         for task in tasks:
-            if task == GoalTask:
+            if task in (GoalTask, PushReachTask):
                 self._has_goal_task = True
             task_group = TaskGroup()
             task_group.add_task(task(env=self, max_steps=max_steps))
diff --git a/python/social_bot/models/wood_cube_30cm_without_offset/model.config b/python/social_bot/models/wood_cube_30cm_without_offset/model.config
new file mode 100644
index 00000000..66ed48ec
--- /dev/null
+++ b/python/social_bot/models/wood_cube_30cm_without_offset/model.config
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+
+<model>
+  <name>Wood cube 30cm</name>
+  <version>1.0</version>
+  <sdf version='1.5'>model.sdf</sdf>
+
+  <author>
+    <name>Nate Koenig</name>
+    <email>nate@osrfoundation.org</email>
+  </author>
+
+  <description>
+    A 30cm^3 wooden cube modified from the 10x10x10 cm wooden cube.
+    The 5cm height offset of the internal link has been removed from the original version.
+    Also reduced mass by 0.3, and friction to 1/2.
+  </description>
+</model>
diff --git a/python/social_bot/models/wood_cube_30cm_without_offset/model.sdf b/python/social_bot/models/wood_cube_30cm_without_offset/model.sdf
new file mode 100644
index 00000000..10db372c
--- /dev/null
+++ b/python/social_bot/models/wood_cube_30cm_without_offset/model.sdf
@@ -0,0 +1,63 @@
+<?xml version="1.0" ?>
+<sdf version="1.5">
+  <model name="wood_cube_30cm_without_offset">
+    <link name="link">
+      <inertial>
+        <mass>0.2167</mass>
+        <inertia>
+          <ixx>0.0008611666666666669</ixx>
+          <ixy>0</ixy>
+          <ixz>0</ixz>
+          <iyy>0.0008611666666666669</iyy>
+          <iyz>0</iyz>
+          <izz>0.0008611666666666669</izz>
+        </inertia>
+      </inertial>
+
+      <collision name="collision">
+        <geometry>
+          <box>
+            <size>0.3 0.3 0.3</size>
+          </box>
+        </geometry>
+        <surface>
+          <contact>
+            <!-- Red Pine coefficients for longitudinal axis of the wood
+                 according to:
+                 http://www.fpl.fs.fed.us/documnts/fplgtr/fplgtr113/ch04.pdf -->
+            <poissons_ratio>0.347</poissons_ratio>
+            <elastic_modulus>8.8e+09</elastic_modulus>
+            <ode>
+              <kp>100000</kp>
+              <kd>100</kd>
+              <max_vel>100.0</max_vel>
+              <min_depth>0.001</min_depth>
+            </ode>
+          </contact>
+          <friction>
+            <torsional>
+              <coefficient>0.5</coefficient>
+              <use_patch_radius>0</use_patch_radius>
+              <surface_radius>0.01</surface_radius>
+            </torsional>
+          </friction>
+        </surface>
+      </collision>
+
+      <visual name="visual">
+        <geometry>
+          <box>
+            <size>0.3 0.3 0.3</size>
+          </box>
+        </geometry>
+        <material>
+          <script>
+            <uri>file://media/materials/scripts/gazebo.material</uri>
+            <name>Gazebo/Wood</name>
+          </script>
+        </material>
+      </visual>
+
+    </link>
+  </model>
+</sdf>
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 9a3ae7cc..d0dfb48d 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -317,14 +317,16 @@ def __init__(self,
         if random_goal and goal_name not in distraction_list:
             distraction_list.append(goal_name)
         self._distraction_list = distraction_list
-        self._object_list = distraction_list
-        if goal_name and goal_name not in distraction_list:
+        self._object_list = distraction_list.copy()
+        if goal_name and goal_name not in self._object_list:
             self._object_list.append(goal_name)
         self._goals = self._object_list
         self._move_goal_during_episode = move_goal_during_episode
         self._success_with_angle_requirement = success_with_angle_requirement
         if not additional_observation_list:
             additional_observation_list = self._object_list
+        else:
+            self._object_list.extend(additional_observation_list)
         self._additional_observation_list = additional_observation_list
         self._pos_list = list(
             itertools.product(
@@ -422,7 +424,7 @@ def _prepare_teacher_action(self,
         goal_dist = 0.
         if rewards is not None:
             rewards = rewards.astype(np.float32)
-        if done:
+        if self._goal_dist > 0:
             goal_dist = self._goal_dist
             # clear self._goal_dist so it is only output once
             self._goal_dist = 0.
@@ -712,7 +714,8 @@ def _move_obj(self,
         _min_distance = self._min_distance
         while True:
             attempts += 1
-            dist = random.random() * range
+            dist = random.random() * (range - self._success_distance_thresh
+                                      ) + self._success_distance_thresh
             if self._curriculum_target_angle:
                 angle_range = self._random_angle
             else:
@@ -734,13 +737,13 @@ def _move_obj(self,
                     self._max_play_ground_size):  # not within walls
                 satisfied = False
             for avoid_loc in avoid_locations:
-                dist = np.linalg.norm(loc - avoid_loc)
-                if dist < _min_distance:
+                ddist = np.linalg.norm(loc - avoid_loc)
+                if ddist < _min_distance:
                     satisfied = False
                     break
-            if satisfied or attempts % 10000 == 0 or attempts > 30000:
+            if satisfied or attempts % 100 == 0 or attempts > 300:
                 if not satisfied:
-                    if attempts <= 30000:
+                    if attempts <= 300:
                         _min_distance /= 2.
                         logging.warning(
                             "Took {} times to find satisfying {} "
@@ -886,6 +889,162 @@ def task_specific_observation(self, agent):
         return obs
 
 
+@gin.configurable
+class PushReachTask(GoalTask):
+    def __init__(self,
+                 env,
+                 max_steps,
+                 push_only=True,
+                 obj_names=['wood_cube_30cm_without_offset'],
+                 goal_names=['goal_indicator'],
+                 distraction_list=['car_wheel']):
+        """A Push or Push and Reach task.
+
+        We utilize some of the curriculum, distraction obj handling logic in GoalTask.
+
+        Args:
+            push_only (bool): if True, goal positions are for objects to achieve; otherwise,
+                the first goal position is for the agent to achieve.
+            obj_names (list of string): when not empty, it's the names of the objects to be moved.
+            goal_names (list of string): when not empty, these goal objects indicate the goal locations for
+                the objects in obj_names.
+        """
+        self._push_only = push_only
+        self._obj_names = obj_names
+        self._goal_names = goal_names
+        if push_only:
+            assert len(obj_names) == len(goal_names)
+        else:
+            assert len(obj_names) == len(goal_names) - 1
+        super().__init__(
+            env,
+            max_steps,
+            goal_conditioned=True,
+            use_aux_achieved=True,
+            multi_dim_reward=True,
+            end_episode_after_success=True,
+            goal_name=goal_names[0],
+            distraction_list=distraction_list,
+            additional_observation_list=obj_names + goal_names[1:])
+
+    def _random_move_objects(self, random_range=10.0):
+        pass
+
+    def _move_objs(self, obj_names, ap, ad, avoids=[], record_goal_dist=False):
+        """Move objects according to agent location."""
+        obj_positions = avoids
+        if record_goal_dist:
+            self._goal_dist = 0
+            n = 0
+        for obj_name in obj_names:
+            obj = self._world.get_agent(obj_name)
+            p, dist = self._move_obj(
+                obj, ap, ad, avoid_locations=obj_positions, name=obj_name)
+            if record_goal_dist:
+                self._goal_dist += dist
+                n += 1
+            obj_positions.append(p)
+        if record_goal_dist:
+            self._goal_dist /= n
+        return obj_positions
+
+    def _move_goals(self, ap, ad, obj_locs=[]):
+        """Move goal locations according to object locations."""
+        # TODO(lezhao): copy obj_locs into avoids and put ap in avoids,
+        # then find goal location based on each obj location.
+        return self._move_objs(
+            self._goal_names, ap, ad, obj_locs, record_goal_dist=True)
+
+    def _move_distractions(self, ap, ad, avoids=[]):
+        return self._move_objs(self._distraction_list, ap, ad, avoids)
+
+    def _get_obj_loc_pose(self, obj_names):
+        res_pos = np.array([])
+        res_aux = np.array([])
+        for obj_name in obj_names:
+            obj = self._world.get_agent(obj_name)
+            obj_pos, obj_dir = obj.get_pose()
+            res_pos = np.concatenate((res_pos, obj_pos[0:2]))
+            res_aux = np.concatenate((res_aux, obj_pos[2:], obj_dir))
+        return res_pos, res_aux
+
+    def _get_achieved(self):
+        achieved_loc = np.array([])
+        aux_achieved = np.array(self._agent.get_velocities()).flatten()
+        ap, ad = self._get_agent_loc()
+        if self._push_only:
+            aux_achieved = np.concatenate((aux_achieved, ap, ad))
+        else:
+            achieved_loc = np.concatenate((achieved_loc, ap[0:2]))
+            aux_achieved = np.concatenate((aux_achieved, ap[2:], ad))
+        ach_poss, aux_achs = self._get_obj_loc_pose(self._obj_names)
+        achieved_loc = np.concatenate((achieved_loc, ach_poss))
+        # Ignore object aux dimensions for now
+        # aux_achieved = np.concatenate((aux_achieved, aux_achs))
+        return achieved_loc, aux_achieved
+
+    def _get_desired_goal(self):
+        return self._get_obj_loc_pose(self._goal_names)[0]
+
+    def generate_goal_conditioned_obs(self, agent):
+        ach, aux_ach = self._get_achieved()
+        obs = OrderedDict()
+        obs['observation'] = self._get_obj_loc_pose(self._distraction_list)[0]
+        obs['achieved_goal'] = ach
+        obs['desired_goal'] = self._get_desired_goal()
+        obs['aux_achieved'] = aux_ach
+        return obs
+
+    def _compute_reward(self):
+        def l2_dist_close_reward_fn(achieved_goal, goal):
+            return 0 if np.linalg.norm(
+                achieved_goal - goal) < self._success_distance_thresh else -1
+
+        return l2_dist_close_reward_fn(self._get_achieved()[0],
+                                       self._get_desired_goal())
+
+    def _produce_rewards(self, prev_min_dist_to_distraction):
+        ap, ad = self._get_agent_loc()
+        distraction_penalty, prev_min_dist_to_distraction = (
+            self._get_distraction_penalty(ap, 1.,
+                                          prev_min_dist_to_distraction))
+        reward = self._compute_reward()
+        done = reward >= 0
+        if done:
+            distraction_penalty = 0
+            logging.debug("yielding reward: " + str(reward))
+            logging.debug("at location: %s, aux: %s", ap.astype('|S5'),
+                          np.array(ad).astype('|S5'))
+        reward += distraction_penalty
+        rewards = np.array([reward, -distraction_penalty])
+        return self._prepare_teacher_action(
+            reward=reward,
+            sentence="well done",
+            done=done,
+            success=done,
+            rewards=rewards), prev_min_dist_to_distraction
+
+    def _run_one_goal(self, goal=None, move_distractions=None):
+        # The two params are not used here.
+        ap, ad = self._get_agent_loc()
+        obj_positions = self._move_objs(self._obj_names, ap, ad)
+        avoids = self._move_goals(ap, ad, obj_positions)
+        avoids = self._move_distractions(ap, ad, avoids)
+        avoids.clear()
+        steps_since_last_reward = 0
+        prev_min_dist_to_distraction = 100
+        rewards = None  # reward array in multi_dim_reward case
+        while steps_since_last_reward < self._max_steps:
+            steps_since_last_reward += 1
+            (agent_sentence, prev_min_dist_to_distraction
+             ) = self._produce_rewards(prev_min_dist_to_distraction)
+            yield agent_sentence
+        (agent_sentence, prev_min_dist_to_distraction
+         ) = self._produce_rewards(prev_min_dist_to_distraction)
+        agent_sentence.done = True
+        yield agent_sentence
+
+
 @gin.configurable
 class ICubAuxiliaryTask(Task):
     """

From 25b0f893fcbb3e6ee922353508ce4c0ecd695555 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Wed, 26 May 2021 14:18:19 -0700
Subject: [PATCH 35/53] fix goal dist output bug

---
 python/social_bot/tasks.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index d0dfb48d..8c48a12b 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -358,7 +358,7 @@ def __init__(self,
             min_distance = self._success_distance_thresh
         self._min_distance = min_distance
         self._goal_dist = 0.
-        obs_format = "image"
+        obs_format = "image or full_state"
         obs_relative = "ego"
         if use_egocentric_states:
             obs_format = "full_state"
@@ -424,7 +424,9 @@ def _prepare_teacher_action(self,
         goal_dist = 0.
         if rewards is not None:
             rewards = rewards.astype(np.float32)
-        if self._goal_dist > 0:
+        # only output when episode done, otherwise, reward accumulator in tensorboard
+        # may miss it.
+        if done:
             goal_dist = self._goal_dist
             # clear self._goal_dist so it is only output once
             self._goal_dist = 0.
@@ -730,8 +732,6 @@ def _move_obj(self,
                 loc = np.asarray((random.random() * range - range / 2,
                                   random.random() * range - range / 2, 0))
 
-            if is_goal:
-                self._initial_dist = np.linalg.norm(loc - agent_loc)
             satisfied = True
             if (abs(loc[0]) > self._max_play_ground_size or abs(loc[1]) >
                     self._max_play_ground_size):  # not within walls
@@ -758,6 +758,8 @@ def _move_obj(self,
                             ": {}.".format(name, str(agent_loc), str(range),
                                            str(_min_distance),
                                            str(self._max_play_ground_size)))
+                if is_goal:
+                    self._initial_dist = np.linalg.norm(loc - agent_loc)
                 break
         if is_goal:
             self._prev_dist = self._initial_dist
@@ -964,8 +966,9 @@ def _get_obj_loc_pose(self, obj_names):
         for obj_name in obj_names:
             obj = self._world.get_agent(obj_name)
             obj_pos, obj_dir = obj.get_pose()
+            # vel = np.array(obj.get_velocities()).flatten()
             res_pos = np.concatenate((res_pos, obj_pos[0:2]))
-            res_aux = np.concatenate((res_aux, obj_pos[2:], obj_dir))
+            res_aux = np.concatenate((res_aux, obj_pos[2:], obj_dir))  # , vel
         return res_pos, res_aux
 
     def _get_achieved(self):

From 30864bb194d7e63376e2842004c23fb9e5e9b50f Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Tue, 8 Jun 2021 11:30:42 -0700
Subject: [PATCH 36/53] allow obj to be initialized closer to the agent

---
 python/social_bot/tasks.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 61cba68d..620674c5 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -702,6 +702,7 @@ def _move_obj(self,
                   agent_dir,
                   is_goal=True,
                   avoid_locations=[],
+                  close_to_agent=False,
                   name="Unspecified"):
         if (self.should_use_curriculum_training()
                 and self._percent_full_range_in_curriculum > 0
@@ -714,10 +715,13 @@ def _move_obj(self,
         attempts = 0
         dist = range
         _min_distance = self._min_distance
+        _succ_thd = self._success_distance_thresh
+        if close_to_agent:
+            _min_distance = 0.1
+            _succ_thd = 0.1
         while True:
             attempts += 1
-            dist = random.random() * (range - self._success_distance_thresh
-                                      ) + self._success_distance_thresh
+            dist = random.random() * (range - _succ_thd) + _succ_thd
             if self._curriculum_target_angle:
                 angle_range = self._random_angle
             else:
@@ -899,7 +903,8 @@ def __init__(self,
                  push_only=True,
                  obj_names=['wood_cube_30cm_without_offset'],
                  goal_names=['goal_indicator'],
-                 distraction_list=['car_wheel']):
+                 distraction_list=['car_wheel'],
+                 close_to_agent=False):
         """A Push or Push and Reach task.
 
         We utilize some of the curriculum, distraction obj handling logic in GoalTask.
@@ -910,10 +915,12 @@ def __init__(self,
             obj_names (list of string): when not empty, it's the names of the objects to be moved.
             goal_names (list of string): when not empty, these goal objects indicate the goal locations for
                 the objects in obj_names.
+            close_to_agent (bool): whether to initialize object to be pushed closer to the agent.
         """
         self._push_only = push_only
         self._obj_names = obj_names
         self._goal_names = goal_names
+        self._close_to_agent = close_to_agent
         if push_only:
             assert len(obj_names) == len(goal_names)
         else:
@@ -932,7 +939,13 @@ def __init__(self,
     def _random_move_objects(self, random_range=10.0):
         pass
 
-    def _move_objs(self, obj_names, ap, ad, avoids=[], record_goal_dist=False):
+    def _move_objs(self,
+                   obj_names,
+                   ap,
+                   ad,
+                   avoids=[],
+                   record_goal_dist=False,
+                   close_to_agent=False):
         """Move objects according to agent location."""
         obj_positions = avoids
         if record_goal_dist:
@@ -941,7 +954,12 @@ def _move_objs(self, obj_names, ap, ad, avoids=[], record_goal_dist=False):
         for obj_name in obj_names:
             obj = self._world.get_agent(obj_name)
             p, dist = self._move_obj(
-                obj, ap, ad, avoid_locations=obj_positions, name=obj_name)
+                obj,
+                ap,
+                ad,
+                avoid_locations=obj_positions,
+                name=obj_name,
+                close_to_agent=close_to_agent)
             if record_goal_dist:
                 self._goal_dist += dist
                 n += 1
@@ -1030,7 +1048,8 @@ def _produce_rewards(self, prev_min_dist_to_distraction):
     def _run_one_goal(self, goal=None, move_distractions=None):
         # The two params are not used here.
         ap, ad = self._get_agent_loc()
-        obj_positions = self._move_objs(self._obj_names, ap, ad)
+        obj_positions = self._move_objs(
+            self._obj_names, ap, ad, close_to_agent=self._close_to_agent)
         avoids = self._move_goals(ap, ad, obj_positions)
         avoids = self._move_distractions(ap, ad, avoids)
         avoids.clear()

From 4fee570612511aca18ed623f393ac50a268d8a16 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao@horizon.ai>
Date: Tue, 8 Jun 2021 14:55:25 -0700
Subject: [PATCH 37/53] update some doc

---
 README.md | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4f6b3f64..17a3fa88 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,11 @@ You also need to install the following packages:
 apt install python3-tk
 ```
 
+## Download
+```bash
+git clone --recursive https://github.com/HorizonRobotics/SocialRobot
+```
+
 ## To compile
 ```bash
 cd REPO_ROOT
@@ -27,7 +32,7 @@ mkdir build
 cd build
 cmake ..
 make -j
-cd REPO_ROOT
+cd ..  # REPO_ROOT
 pip3 install -e .
 ```
 if cmake .. complains about cannot find configuration file provided by "gazebo", run
@@ -165,6 +170,22 @@ It's not so easy, for some task you may need some trials to complete it. An exam
 ### python
 You need to make sure the python you use matches the python found by cmake. You can check this by looking at REPO_ROOT/build/CMakeCache.txt
 
+### Python config cmake errors
+```
+CMake Error at pybind11/tools/FindPythonLibsNew.cmake:95 (message):
+  Python config failure:
+
+Call Stack (most recent call first):
+  pybind11/tools/pybind11Tools.cmake:16 (find_package)
+  pybind11/CMakeLists.txt:33 (include)
+```
+
+Remove the makefiles and build again:
+```bash
+cd REPO_ROOT
+rm -rf build
+```
+
 ### display
 
 You need to make sure your `DISPLAY` environment variable points to a valid display, otherwise camera sensor cannot be created and there will be the following error:

From ed5f7b6da25c67ce9c0700c6a78a891d93e0fe50 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Tue, 15 Jun 2021 11:26:53 -0700
Subject: [PATCH 38/53] allow using obj pose, move obj at least 0.3 away from
 agent.

---
 python/social_bot/tasks.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 620674c5..bdfdde9b 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -717,8 +717,8 @@ def _move_obj(self,
         _min_distance = self._min_distance
         _succ_thd = self._success_distance_thresh
         if close_to_agent:
-            _min_distance = 0.1
-            _succ_thd = 0.1
+            _min_distance = 0.3
+            _succ_thd = _min_distance
         while True:
             attempts += 1
             dist = random.random() * (range - _succ_thd) + _succ_thd
@@ -904,7 +904,8 @@ def __init__(self,
                  obj_names=['wood_cube_30cm_without_offset'],
                  goal_names=['goal_indicator'],
                  distraction_list=['car_wheel'],
-                 close_to_agent=False):
+                 close_to_agent=False,
+                 use_obj_pose=False):
         """A Push or Push and Reach task.
 
         We utilize some of the curriculum, distraction obj handling logic in GoalTask.
@@ -916,11 +917,13 @@ def __init__(self,
             goal_names (list of string): when not empty, these goal objects indicate the goal locations for
                 the objects in obj_names.
             close_to_agent (bool): whether to initialize object to be pushed closer to the agent.
+            use_obj_pose (bool): include object auxiliary dimensions as input.
         """
         self._push_only = push_only
         self._obj_names = obj_names
         self._goal_names = goal_names
         self._close_to_agent = close_to_agent
+        self._use_obj_pose = use_obj_pose
         if push_only:
             assert len(obj_names) == len(goal_names)
         else:
@@ -1000,8 +1003,8 @@ def _get_achieved(self):
             aux_achieved = np.concatenate((aux_achieved, ap[2:], ad))
         ach_poss, aux_achs = self._get_obj_loc_pose(self._obj_names)
         achieved_loc = np.concatenate((achieved_loc, ach_poss))
-        # Ignore object aux dimensions for now
-        # aux_achieved = np.concatenate((aux_achieved, aux_achs))
+        if self._use_obj_pose:
+            aux_achieved = np.concatenate((aux_achieved, aux_achs))
         return achieved_loc, aux_achieved
 
     def _get_desired_goal(self):

From d68a3ff01409807d60bd346b0322e401e2267e96 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Fri, 18 Jun 2021 22:07:35 -0700
Subject: [PATCH 39/53] allow moving target relative to object position, vs
 agent

---
 python/social_bot/tasks.py | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index bdfdde9b..6cec25fa 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -905,6 +905,7 @@ def __init__(self,
                  goal_names=['goal_indicator'],
                  distraction_list=['car_wheel'],
                  close_to_agent=False,
+                 target_relative_to_obj=False,
                  use_obj_pose=False):
         """A Push or Push and Reach task.
 
@@ -917,11 +918,13 @@ def __init__(self,
             goal_names (list of string): when not empty, these goal objects indicate the goal locations for
                 the objects in obj_names.
             close_to_agent (bool): whether to initialize object to be pushed closer to the agent.
+            target_relative_to_obj (bool): initialize target position relative to object vs agent location.
             use_obj_pose (bool): include object auxiliary dimensions as input.
         """
         self._push_only = push_only
         self._obj_names = obj_names
         self._goal_names = goal_names
+        self._target_relative_to_obj = target_relative_to_obj
         self._close_to_agent = close_to_agent
         self._use_obj_pose = use_obj_pose
         if push_only:
@@ -948,24 +951,34 @@ def _move_objs(self,
                    ad,
                    avoids=[],
                    record_goal_dist=False,
-                   close_to_agent=False):
+                   close_to_agent=False,
+                   target_relative_to_obj=False):
         """Move objects according to agent location."""
         obj_positions = avoids
+        if target_relative_to_obj:
+            assert len(avoids) == len(obj_names), "num objects != num targets"
+            orig_obj_pos = avoids.copy()
+
         if record_goal_dist:
             self._goal_dist = 0
-            n = 0
+        n = 0
         for obj_name in obj_names:
             obj = self._world.get_agent(obj_name)
+            start_pos = ap
+            _avoids = obj_positions
+            if target_relative_to_obj:
+                start_pos = orig_obj_pos[n]
+                _avoids = _avoids + [ap]
             p, dist = self._move_obj(
                 obj,
-                ap,
+                start_pos,
                 ad,
                 avoid_locations=obj_positions,
                 name=obj_name,
                 close_to_agent=close_to_agent)
             if record_goal_dist:
                 self._goal_dist += dist
-                n += 1
+            n += 1
             obj_positions.append(p)
         if record_goal_dist:
             self._goal_dist /= n
@@ -976,7 +989,12 @@ def _move_goals(self, ap, ad, obj_locs=[]):
         # TODO(lezhao): copy obj_locs into avoids and put ap in avoids,
         # then find goal location based on each obj location.
         return self._move_objs(
-            self._goal_names, ap, ad, obj_locs, record_goal_dist=True)
+            self._goal_names,
+            ap,
+            ad,
+            obj_locs,
+            record_goal_dist=True,
+            target_relative_to_obj=self._target_relative_to_obj)
 
     def _move_distractions(self, ap, ad, avoids=[]):
         return self._move_objs(self._distraction_list, ap, ad, avoids)

From 59086484846194fd3143bade74bfddd146f23546 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 12 Aug 2021 11:42:26 -0700
Subject: [PATCH 40/53] allow turn off multi dim reward in push task; more
 debug printing.

---
 python/social_bot/tasks.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 6cec25fa..1ab2b0d9 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -904,6 +904,7 @@ def __init__(self,
                  obj_names=['wood_cube_30cm_without_offset'],
                  goal_names=['goal_indicator'],
                  distraction_list=['car_wheel'],
+                 multi_dim_reward=True,
                  close_to_agent=False,
                  target_relative_to_obj=False,
                  use_obj_pose=False):
@@ -924,6 +925,7 @@ def __init__(self,
         self._push_only = push_only
         self._obj_names = obj_names
         self._goal_names = goal_names
+        self._multi_dim_reward = multi_dim_reward
         self._target_relative_to_obj = target_relative_to_obj
         self._close_to_agent = close_to_agent
         self._use_obj_pose = use_obj_pose
@@ -936,7 +938,7 @@ def __init__(self,
             max_steps,
             goal_conditioned=True,
             use_aux_achieved=True,
-            multi_dim_reward=True,
+            multi_dim_reward=multi_dim_reward,
             end_episode_after_success=True,
             goal_name=goal_names[0],
             distraction_list=distraction_list,
@@ -1054,11 +1056,15 @@ def _produce_rewards(self, prev_min_dist_to_distraction):
         done = reward >= 0
         if done:
             distraction_penalty = 0
+            av = np.array(self._agent.get_velocities()).flatten()
             logging.debug("yielding reward: " + str(reward))
-            logging.debug("at location: %s, aux: %s", ap.astype('|S5'),
-                          np.array(ad).astype('|S5'))
+            logging.debug("at location: %s, aux: %s, vel: %s",
+                          ap.astype('|S5'),
+                          np.array(ad).astype('|S5'), av.astype('|S5'))
         reward += distraction_penalty
-        rewards = np.array([reward, -distraction_penalty])
+        rewards = None
+        if self._multi_dim_reward:
+            rewards = np.array([reward, -distraction_penalty])
         return self._prepare_teacher_action(
             reward=reward,
             sentence="well done",
@@ -1076,7 +1082,6 @@ def _run_one_goal(self, goal=None, move_distractions=None):
         avoids.clear()
         steps_since_last_reward = 0
         prev_min_dist_to_distraction = 100
-        rewards = None  # reward array in multi_dim_reward case
         while steps_since_last_reward < self._max_steps:
             steps_since_last_reward += 1
             (agent_sentence, prev_min_dist_to_distraction

From 8bd875373b43dd02d30208176fa097b05bde2ee3 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Mon, 16 Aug 2021 15:01:02 -0700
Subject: [PATCH 41/53] proper seeding of envs for deterministic randomization

---
 pygazebo/pygazebo.cc                  |  6 ++++++
 python/social_bot/envs/gazebo_base.py | 15 +++++++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/pygazebo/pygazebo.cc b/pygazebo/pygazebo.cc
index cb94ce7c..faccf1e4 100644
--- a/pygazebo/pygazebo.cc
+++ b/pygazebo/pygazebo.cc
@@ -22,6 +22,7 @@
 #include <gazebo/sensors/SensorManager.hh>
 #include <gazebo/sensors/SensorsIface.hh>
 #include <gazebo/util/LogRecord.hh>
+#include <ignition/math/Rand.hh>
 #include <mutex>  // NOLINT
 #include <sstream>
 
@@ -469,6 +470,7 @@ class World {
 
 void Initialize(const std::vector<std::string>& args,
                 int port = 0,
+                int seed = -1,
                 bool quiet = false) {
   if (port != 0) {
     std::string uri = "localhost:" + std::to_string(port);
@@ -477,6 +479,9 @@ void Initialize(const std::vector<std::string>& args,
   if (!gazebo_initialized) {
     gazebo::common::Console::SetQuiet(quiet);
     gazebo::setupServer(args);
+    if (seed >= 0) {
+      ignition::math::Rand::Seed((unsigned int)seed);
+    }
     // gazebo::runWorld uses World::RunLoop(). RunLoop() starts LogWorker()
     // every time. LogWorker will always store something in the buffer when
     // it is started. But we don't have a running LogRecord to collect all
@@ -568,6 +573,7 @@ PYBIND11_MODULE(pygazebo, m) {
         "Initialize",
         py::arg("args") = std::vector<std::string>(),
         py::arg("port") = 0,
+        py::arg("seed") = -1,
         py::arg("quiet") = false);
 
   m.def("close", &Close, "Shutdown everything of gazebo");
diff --git a/python/social_bot/envs/gazebo_base.py b/python/social_bot/envs/gazebo_base.py
index 4642a30d..f284b85f 100644
--- a/python/social_bot/envs/gazebo_base.py
+++ b/python/social_bot/envs/gazebo_base.py
@@ -43,6 +43,8 @@ def __init__(self,
                  world_config=None,
                  sim_time_precision=0.001,
                  port=None,
+                 seed=-1,
+                 use_port_in_gz_seed=True,
                  quiet=False):
         """
         Args:
@@ -52,6 +54,10 @@ def __init__(self,
                 see `_modify_world_xml` for details
              sim_time_precision (float): the time precision of the simulator
              port (int): Gazebo port
+             seed (int): random seed for gazebo, -1 for using the default: PID
+             use_port_in_gz_seed (bool): if True, use port and seed in gazebo seed.
+                Set to False when doing repeatability debugging.  Set to True when
+                running several envs in parallel during regular training.
              quiet (bool) Set quiet output
         """
         os.environ["GAZEBO_MODEL_DATABASE_URI"] = ""
@@ -61,12 +67,17 @@ def __init__(self,
         # This avoids different parallel simulations having the same randomness.
         # When calling from alf, alf.environments.utils.create_environment calls
         # env.seed() afterwards, which is the real seed being used.  Not this one.
-        random.seed(port)
+        _seed = port
+        if seed >= 0:
+            _seed = seed
+            if use_port_in_gz_seed:
+                _seed += port
+        random.seed(_seed)
         self._rendering_process = None
         self._rendering_camera = None
         # the default camera pose for rendering rgb_array, could be override
         self._rendering_cam_pose = "10 -10 6 0 0.4 2.4"
-        gazebo.initialize(port=port, quiet=quiet)
+        gazebo.initialize(port=port, seed=_seed, quiet=quiet)
 
         if world_file:
             world_file_abs_path = os.path.join(social_bot.get_world_dir(),

From 8a15ea9e0aa8815712355cc06dad94dcf7f4eaeb Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Tue, 21 Sep 2021 09:29:04 -0700
Subject: [PATCH 42/53] mode with Push and Reach goals in a single
 PushReachTask.

---
 python/social_bot/tasks.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 1ab2b0d9..d654e15d 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -900,7 +900,7 @@ class PushReachTask(GoalTask):
     def __init__(self,
                  env,
                  max_steps,
-                 push_only=True,
+                 push_reach=False,
                  obj_names=['wood_cube_30cm_without_offset'],
                  goal_names=['goal_indicator'],
                  distraction_list=['car_wheel'],
@@ -913,7 +913,7 @@ def __init__(self,
         We utilize some of the curriculum, distraction obj handling logic in GoalTask.
 
         Args:
-            push_only (bool): if True, goal positions are for objects to achieve; otherwise,
+            push_reach (bool): if False, push only, goal positions are for objects to achieve; otherwise,
                 the first goal position is for the agent to achieve.
             obj_names (list of string): when not empty, it's the names of the objects to be moved.
             goal_names (list of string): when not empty, these goal objects indicate the goal locations for
@@ -922,14 +922,16 @@ def __init__(self,
             target_relative_to_obj (bool): initialize target position relative to object vs agent location.
             use_obj_pose (bool): include object auxiliary dimensions as input.
         """
-        self._push_only = push_only
+        self._push_reach = push_reach
+        if push_reach:
+            goal_names.append('target_ball')
         self._obj_names = obj_names
         self._goal_names = goal_names
         self._multi_dim_reward = multi_dim_reward
         self._target_relative_to_obj = target_relative_to_obj
         self._close_to_agent = close_to_agent
         self._use_obj_pose = use_obj_pose
-        if push_only:
+        if not push_reach:
             assert len(obj_names) == len(goal_names)
         else:
             assert len(obj_names) == len(goal_names) - 1
@@ -1015,8 +1017,10 @@ def _get_obj_loc_pose(self, obj_names):
     def _get_achieved(self):
         achieved_loc = np.array([])
         aux_achieved = np.array(self._agent.get_velocities()).flatten()
+        # print("gazebo port: ", self._env._port)
+        # print("gazebo aux_ach: ", aux_achieved, flush=True)
         ap, ad = self._get_agent_loc()
-        if self._push_only:
+        if not self._push_reach:
             aux_achieved = np.concatenate((aux_achieved, ap, ad))
         else:
             achieved_loc = np.concatenate((achieved_loc, ap[0:2]))

From b90e6e5ff0c1cb1754b903e6b6adadce8c46a409 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 28 Apr 2022 13:37:25 -0700
Subject: [PATCH 43/53] update Pioneer tasks (target navigation, push reach) to
 alf config

---
 python/social_bot/envs/play_ground.py |  5 +++--
 python/social_bot/gazebo_agent.py     |  7 ++++---
 python/social_bot/tasks.py            | 17 +++++++++--------
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index 2f63bd4b..615d683e 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -22,7 +22,6 @@
 import random
 import json
 import PIL
-import gin
 import gym
 from gym import spaces
 from absl import logging
@@ -38,8 +37,10 @@
 from social_bot.teacher import TeacherAction
 from social_bot.tasks import GoalTask, PushReachTask, ICubAuxiliaryTask, KickingBallTask, Reaching3D
 
+import alf
 
-@gin.configurable
+
+@alf.configurable
 class PlayGround(GazeboEnvBase):
     """
     This envionment support agent type of pr2_noplugin, pioneer2dx_noplugin,
diff --git a/python/social_bot/gazebo_agent.py b/python/social_bot/gazebo_agent.py
index 88cb8eca..27d02b44 100644
--- a/python/social_bot/gazebo_agent.py
+++ b/python/social_bot/gazebo_agent.py
@@ -17,7 +17,6 @@
 import time
 import random
 import json
-import gin
 import numpy as np
 import PIL.Image
 from collections import OrderedDict
@@ -27,8 +26,10 @@
 import social_bot.pygazebo as gazebo
 from social_bot.tasks import GoalTask
 
+import alf
 
-@gin.configurable
+
+@alf.configurable
 class GazeboAgent():
     """ Class for the agent of gazebo-based SocialRobot enviroments
     """
@@ -429,7 +430,7 @@ def wrap_actions(self, action):
         raise NotImplementedError("wrap_actions not implemented!")
 
 
-@gin.configurable
+@alf.configurable
 class YoubotActionWrapper(ActionWrapper):
     """ This action wrapper transform a new actions to primitive actions.
 
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index d654e15d..c05da72c 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -19,7 +19,6 @@
 import numpy as np
 import operator
 import os
-import gin
 import itertools
 import random
 import json
@@ -29,6 +28,8 @@
 import social_bot
 from social_bot.teacher import TeacherAction
 
+import alf
+
 
 class Task(object):
     """Base class for Task.
@@ -161,7 +162,7 @@ def _random_move_object(self,
         return np.array(loc)
 
 
-@gin.configurable
+@alf.configurable
 class GoalTask(Task):
     """
     A simple teacher task to find a goal.
@@ -895,7 +896,7 @@ def task_specific_observation(self, agent):
         return obs
 
 
-@gin.configurable
+@alf.configurable
 class PushReachTask(GoalTask):
     def __init__(self,
                  env,
@@ -1097,7 +1098,7 @@ def _run_one_goal(self, goal=None, move_distractions=None):
         yield agent_sentence
 
 
-@gin.configurable
+@alf.configurable
 class ICubAuxiliaryTask(Task):
     """
     An auxiliary task spicified for iCub, to keep the agent from falling down
@@ -1272,7 +1273,7 @@ def task_specific_observation(self, agent):
             return icub_extra_obs
 
 
-@gin.configurable
+@alf.configurable
 class KickingBallTask(Task):
     """
     A simple task to kick a ball so that it rolls into the goal. An
@@ -1419,7 +1420,7 @@ def _move_ball(self, ball):
         ball.set_pose((loc, (0, 0, 0)))
 
 
-@gin.configurable
+@alf.configurable
 class Reaching3D(Task):
     """
     A task to reach a random 3D position with the end effector of a robot arm.
@@ -1504,7 +1505,7 @@ def task_specific_observation(self, agent):
         return obs
 
 
-@gin.configurable
+@alf.configurable
 class PickAndPlace(Task):
     """
     A task to grip an object (a wood cube), move and then place it to the target position.
@@ -1653,7 +1654,7 @@ def task_specific_observation(self, agent):
             (obs, finger_contacts, agent_pose, joints_states), axis=0)
 
 
-@gin.configurable
+@alf.configurable
 class Stack(Task):
     """
     A task to stack several wood cubes. The agent need to grasp the cube and

From 9185962af6112e9a966933c5bcb49c2ab60c81c7 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 28 Apr 2022 13:43:32 -0700
Subject: [PATCH 44/53] depend on alf for alf.config

---
 setup.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 99957feb..99c47fa8 100644
--- a/setup.py
+++ b/setup.py
@@ -13,9 +13,17 @@
     version='0.0.1',
     install_requires=[
         'gin-config@git+https://github.com/HorizonRobotics/gin-config.git',
-        'gym == 0.12.5', 'numpy >= 1.13.3', 'matplotlib', 'psutil',
-        'lxml >= 3.5.0', 'Pillow', 'absl-py', 'pytest', 'PyUserInput',
-        'python-xlib'
+        'gym == 0.12.5',
+        'numpy >= 1.13.3',
+        'matplotlib',
+        'psutil',
+        'lxml >= 3.5.0',
+        'Pillow',
+        'absl-py',
+        'pytest',
+        'PyUserInput',
+        'python-xlib',
+        'alf@git+https://github.com/HorizonRobotics/alf.git',
     ],
     package_dir={'': 'python'},
     packages=find_packages('python'),

From cc5c764ede2ef8769506a50105ef01c46875b67b Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 28 Apr 2022 15:02:43 -0700
Subject: [PATCH 45/53] update alf image

---
 .github/workflows/python_test.yml | 2 +-
 setup.py                          | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/python_test.yml b/.github/workflows/python_test.yml
index 237bfcca..0390b91f 100644
--- a/.github/workflows/python_test.yml
+++ b/.github/workflows/python_test.yml
@@ -7,7 +7,7 @@ jobs:
 
     runs-on: ubuntu-18.04
     container:
-      image: horizonrobotics/alf:0.0.3-pytorch
+      image: horizonrobotics/alf:0.0.6-pytorch1.8-python3.7
     steps:
     - uses: actions/checkout@v1
     - name: Install dependencies
diff --git a/setup.py b/setup.py
index 99c47fa8..19872a2a 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,6 @@
         'pytest',
         'PyUserInput',
         'python-xlib',
-        'alf@git+https://github.com/HorizonRobotics/alf.git',
     ],
     package_dir={'': 'python'},
     packages=find_packages('python'),

From 7460efd7d10feeab0ae744a3d1fe0312506585b9 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Fri, 29 Apr 2022 11:30:19 -0700
Subject: [PATCH 46/53] Revert "update alf image"

This reverts commit cc5c764ede2ef8769506a50105ef01c46875b67b.
---
 .github/workflows/python_test.yml | 2 +-
 setup.py                          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python_test.yml b/.github/workflows/python_test.yml
index 0390b91f..237bfcca 100644
--- a/.github/workflows/python_test.yml
+++ b/.github/workflows/python_test.yml
@@ -7,7 +7,7 @@ jobs:
 
     runs-on: ubuntu-18.04
     container:
-      image: horizonrobotics/alf:0.0.6-pytorch1.8-python3.7
+      image: horizonrobotics/alf:0.0.3-pytorch
     steps:
     - uses: actions/checkout@v1
     - name: Install dependencies
diff --git a/setup.py b/setup.py
index 19872a2a..99c47fa8 100644
--- a/setup.py
+++ b/setup.py
@@ -23,6 +23,7 @@
         'pytest',
         'PyUserInput',
         'python-xlib',
+        'alf@git+https://github.com/HorizonRobotics/alf.git',
     ],
     package_dir={'': 'python'},
     packages=find_packages('python'),

From 25cc082e29fb89a1b2942e422c233db3f52535f7 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Fri, 29 Apr 2022 11:30:23 -0700
Subject: [PATCH 47/53] Revert "depend on alf for alf.config"

This reverts commit 9185962af6112e9a966933c5bcb49c2ab60c81c7.
---
 setup.py | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/setup.py b/setup.py
index 99c47fa8..99957feb 100644
--- a/setup.py
+++ b/setup.py
@@ -13,17 +13,9 @@
     version='0.0.1',
     install_requires=[
         'gin-config@git+https://github.com/HorizonRobotics/gin-config.git',
-        'gym == 0.12.5',
-        'numpy >= 1.13.3',
-        'matplotlib',
-        'psutil',
-        'lxml >= 3.5.0',
-        'Pillow',
-        'absl-py',
-        'pytest',
-        'PyUserInput',
-        'python-xlib',
-        'alf@git+https://github.com/HorizonRobotics/alf.git',
+        'gym == 0.12.5', 'numpy >= 1.13.3', 'matplotlib', 'psutil',
+        'lxml >= 3.5.0', 'Pillow', 'absl-py', 'pytest', 'PyUserInput',
+        'python-xlib'
     ],
     package_dir={'': 'python'},
     packages=find_packages('python'),

From 484b0f9c5edd194825d2d773ff827693b64bf3df Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Fri, 29 Apr 2022 11:30:25 -0700
Subject: [PATCH 48/53] Revert "update Pioneer tasks (target navigation, push
 reach) to alf config"

This reverts commit b90e6e5ff0c1cb1754b903e6b6adadce8c46a409.
---
 python/social_bot/envs/play_ground.py |  5 ++---
 python/social_bot/gazebo_agent.py     |  7 +++----
 python/social_bot/tasks.py            | 17 ++++++++---------
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/python/social_bot/envs/play_ground.py b/python/social_bot/envs/play_ground.py
index 615d683e..2f63bd4b 100644
--- a/python/social_bot/envs/play_ground.py
+++ b/python/social_bot/envs/play_ground.py
@@ -22,6 +22,7 @@
 import random
 import json
 import PIL
+import gin
 import gym
 from gym import spaces
 from absl import logging
@@ -37,10 +38,8 @@
 from social_bot.teacher import TeacherAction
 from social_bot.tasks import GoalTask, PushReachTask, ICubAuxiliaryTask, KickingBallTask, Reaching3D
 
-import alf
 
-
-@alf.configurable
+@gin.configurable
 class PlayGround(GazeboEnvBase):
     """
     This envionment support agent type of pr2_noplugin, pioneer2dx_noplugin,
diff --git a/python/social_bot/gazebo_agent.py b/python/social_bot/gazebo_agent.py
index 27d02b44..88cb8eca 100644
--- a/python/social_bot/gazebo_agent.py
+++ b/python/social_bot/gazebo_agent.py
@@ -17,6 +17,7 @@
 import time
 import random
 import json
+import gin
 import numpy as np
 import PIL.Image
 from collections import OrderedDict
@@ -26,10 +27,8 @@
 import social_bot.pygazebo as gazebo
 from social_bot.tasks import GoalTask
 
-import alf
 
-
-@alf.configurable
+@gin.configurable
 class GazeboAgent():
     """ Class for the agent of gazebo-based SocialRobot enviroments
     """
@@ -430,7 +429,7 @@ def wrap_actions(self, action):
         raise NotImplementedError("wrap_actions not implemented!")
 
 
-@alf.configurable
+@gin.configurable
 class YoubotActionWrapper(ActionWrapper):
     """ This action wrapper transform a new actions to primitive actions.
 
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index c05da72c..d654e15d 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -19,6 +19,7 @@
 import numpy as np
 import operator
 import os
+import gin
 import itertools
 import random
 import json
@@ -28,8 +29,6 @@
 import social_bot
 from social_bot.teacher import TeacherAction
 
-import alf
-
 
 class Task(object):
     """Base class for Task.
@@ -162,7 +161,7 @@ def _random_move_object(self,
         return np.array(loc)
 
 
-@alf.configurable
+@gin.configurable
 class GoalTask(Task):
     """
     A simple teacher task to find a goal.
@@ -896,7 +895,7 @@ def task_specific_observation(self, agent):
         return obs
 
 
-@alf.configurable
+@gin.configurable
 class PushReachTask(GoalTask):
     def __init__(self,
                  env,
@@ -1098,7 +1097,7 @@ def _run_one_goal(self, goal=None, move_distractions=None):
         yield agent_sentence
 
 
-@alf.configurable
+@gin.configurable
 class ICubAuxiliaryTask(Task):
     """
     An auxiliary task spicified for iCub, to keep the agent from falling down
@@ -1273,7 +1272,7 @@ def task_specific_observation(self, agent):
             return icub_extra_obs
 
 
-@alf.configurable
+@gin.configurable
 class KickingBallTask(Task):
     """
     A simple task to kick a ball so that it rolls into the goal. An
@@ -1420,7 +1419,7 @@ def _move_ball(self, ball):
         ball.set_pose((loc, (0, 0, 0)))
 
 
-@alf.configurable
+@gin.configurable
 class Reaching3D(Task):
     """
     A task to reach a random 3D position with the end effector of a robot arm.
@@ -1505,7 +1504,7 @@ def task_specific_observation(self, agent):
         return obs
 
 
-@alf.configurable
+@gin.configurable
 class PickAndPlace(Task):
     """
     A task to grip an object (a wood cube), move and then place it to the target position.
@@ -1654,7 +1653,7 @@ def task_specific_observation(self, agent):
             (obs, finger_contacts, agent_pose, joints_states), axis=0)
 
 
-@alf.configurable
+@gin.configurable
 class Stack(Task):
     """
     A task to stack several wood cubes. The agent need to grasp the cube and

From 611dbd39727e117a0b0de20984e4a10834025c8f Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Wed, 5 Oct 2022 14:45:55 -0700
Subject: [PATCH 49/53] pose only goal for goal task

---
 python/social_bot/tasks.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index d654e15d..35c9b4ee 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -180,6 +180,7 @@ def __init__(self,
                  goal_conditioned=False,
                  speed_goal=False,
                  speed_goal_limit=1.6,
+                 pose_goal=False,
                  use_aux_achieved=False,
                  xy_only_aux=False,
                  multi_dim_reward=False,
@@ -222,6 +223,8 @@ def __init__(self,
             goal_conditioned (bool): if True, each step has -1 reward, unless at goal state, which gives 0.
             speed_goal (bool): if True, use speed pose etc. together with position as part of goal.
             speed_goal_limit (float): randomly sample speed goal in the range: -limit to +limit.
+            pose_goal (bool): When speed_goal is True, if pose_goal is True, speed and everything else is put
+                into observations, instead of aux_achieved, so only pose is in aux_achieved.
             use_aux_achieved (bool): if True, pull out speed, pose dimensions into a separate
                 field: aux_achieved.  Only valid when goal_conditioned is True.
             xy_only_aux (bool): exclude irrelevant dimensions (z-axis movements) from
@@ -289,6 +292,7 @@ def __init__(self,
         self._goal_conditioned = goal_conditioned
         self._speed_goal = speed_goal
         self._speed_goal_limit = speed_goal_limit
+        self._pose_goal = pose_goal
         self._use_aux_achieved = use_aux_achieved
         self._xy_only_aux = xy_only_aux
         self._multi_dim_reward = multi_dim_reward
@@ -458,6 +462,8 @@ def _get_goal_dist(self, goal):
         if self._speed_goal:
             _goal_loc = np.concatenate((_goal_loc, self._aux_desired), axis=0)
             aux = self._get_agent_aux_dims()
+            if self._pose_goal:
+                aux = aux[-1:]  # yaw
             _loc = np.concatenate((_loc, aux), axis=0)
 
         dist = np.linalg.norm(_loc - _goal_loc)
@@ -677,6 +683,8 @@ def _move_goal(self, goal, agent_loc, agent_dir, move_distractions=True):
             # 0, 1, 2: vel; 3, 4, 5: angular vel; 6: z position; 7, 8, 9: roll pitch yaw
             self._aux_desired = np.array(
                 [xspeed, yspeed, 0, 0, 0, yawspeed, 0, 0, 0, yaw])
+            if self._pose_goal:
+                self._aux_desired = np.array([yaw])
         self._goal_dist += dist
         avoid_locations.append(loc)
         distractions = OrderedDict()
@@ -812,6 +820,10 @@ def generate_goal_conditioned_obs(self, agent):
         aux = self._get_agent_aux_dims(agent_pose, agent_vel)
         if self._speed_goal:
             obs['observation'] = flat_obs[14:]
+            if self._pose_goal:
+                obs['observation'] = np.concatenate(
+                    (obs['observation'], aux[:-1]), axis=0)
+                aux = aux[-1:]  # yaw
             obs['achieved_goal'] = np.concatenate((obs['achieved_goal'], aux),
                                                   axis=0)
             obs['desired_goal'] = np.concatenate(

From 96ea226dec05f591babd6b4593a8a58ec1074496 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Thu, 6 Oct 2022 15:57:42 -0700
Subject: [PATCH 50/53] adding ghost pioneer car to indicate goal position and
 orientation in GoalTask

---
 .../pioneer2dx_noplugin_ghost/model-ghost.sdf | 102 ++++++++++++++++++
 .../pioneer2dx_noplugin_ghost/model.config    |  21 ++++
 python/social_bot/tasks.py                    |  21 ++--
 3 files changed, 135 insertions(+), 9 deletions(-)
 create mode 100644 python/social_bot/models/pioneer2dx_noplugin_ghost/model-ghost.sdf
 create mode 100644 python/social_bot/models/pioneer2dx_noplugin_ghost/model.config

diff --git a/python/social_bot/models/pioneer2dx_noplugin_ghost/model-ghost.sdf b/python/social_bot/models/pioneer2dx_noplugin_ghost/model-ghost.sdf
new file mode 100644
index 00000000..de649101
--- /dev/null
+++ b/python/social_bot/models/pioneer2dx_noplugin_ghost/model-ghost.sdf
@@ -0,0 +1,102 @@
+<?xml version="1.0" ?>
+<sdf version="1.5">
+  <model name="pioneer2dx_noplugin_ghost">
+    <static>true</static>
+    <link name="chassis">
+      <pose>0 0 0.16 0 0 0</pose>
+      <inertial>
+        <mass>5.67</mass>
+        <inertia>
+          <ixx>0.07</ixx>
+          <iyy>0.08</iyy>
+          <izz>0.10</izz>
+          <ixy>0</ixy>
+          <ixz>0</ixz>
+          <iyz>0</iyz>
+        </inertia>
+      </inertial>
+      <visual name="visual">
+        <pose>0 0 0.04 0 0 0</pose>
+        <geometry>
+          <mesh>
+            <uri>model://pioneer2dx/meshes/chassis.dae</uri>
+          </mesh>
+        </geometry>
+        <transparency>0.7</transparency>
+      </visual>
+      <visual name="castor_visual">
+        <transparency>0.7</transparency>
+        <pose>-0.200 0 -0.12 0 0 0</pose>
+        <geometry>
+          <sphere>
+            <radius>0.04</radius>
+          </sphere>
+        </geometry>
+        <material>
+          <script>
+            <uri>file://media/materials/scripts/gazebo.material</uri>
+            <name>Gazebo/FlatBlack</name>
+          </script>
+        </material>
+      </visual>
+    </link>
+    <link name="right_wheel">
+      <pose>0.1 -.17 0.11 0 1.5707 1.5707</pose>
+      <inertial>
+        <mass>1.5</mass>
+        <inertia>
+          <ixx>0.0051</ixx>
+          <iyy>0.0051</iyy>
+          <izz>0.0090</izz>
+          <ixy>0</ixy>
+          <ixz>0</ixz>
+          <iyz>0</iyz>
+        </inertia>
+      </inertial>
+      <visual name="visual">
+        <transparency>0.7</transparency>
+        <geometry>
+          <cylinder>
+            <radius>0.11</radius>
+            <length>0.05</length>
+          </cylinder>
+        </geometry>
+        <material>
+          <script>
+            <uri>file://media/materials/scripts/gazebo.material</uri>
+            <name>Gazebo/FlatBlack</name>
+          </script>
+        </material>
+      </visual>
+    </link>
+    <link name="left_wheel">
+      <pose>0.1 .17 0.11 0 1.5707 1.5707</pose>
+      <inertial>
+        <mass>1.5</mass>
+        <inertia>
+          <ixx>0.0051</ixx>
+          <iyy>0.0051</iyy>
+          <izz>0.0090</izz>
+          <ixy>0</ixy>
+          <ixz>0</ixz>
+          <iyz>0</iyz>
+        </inertia>
+      </inertial>
+      <visual name="visual">
+        <transparency>0.7</transparency>
+        <geometry>
+          <cylinder>
+            <radius>0.11</radius>
+            <length>0.05</length>
+          </cylinder>
+        </geometry>
+        <material>
+          <script>
+            <uri>file://media/materials/scripts/gazebo.material</uri>
+            <name>Gazebo/FlatBlack</name>
+          </script>
+        </material>
+      </visual>
+    </link>
+  </model>
+</sdf>
diff --git a/python/social_bot/models/pioneer2dx_noplugin_ghost/model.config b/python/social_bot/models/pioneer2dx_noplugin_ghost/model.config
new file mode 100644
index 00000000..71cee123
--- /dev/null
+++ b/python/social_bot/models/pioneer2dx_noplugin_ghost/model.config
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+
+<model>
+  <!-- This is essentially same as the original Pioneer 2DX model without the model plugin -->
+  <name>Pioneer 2DX Ghost</name>
+  <version>1.0</version>
+  <sdf version="1.5">model-ghost.sdf</sdf>
+
+  <author>
+    <name>Nate Koenig</name>
+    <email>nate@osrfoundation.org</email>
+  </author>
+
+  <description>
+    A model of the pioneer 2 dx robot. This is same as https://bitbucket.org/osrf/gazebo_models/src/default/pioneer2dx/
+    with the plugin removed.
+  </description>
+
+  <depend>
+  </depend>
+</model>
diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 35c9b4ee..7f3570e4 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -173,7 +173,7 @@ class GoalTask(Task):
     def __init__(self,
                  env,
                  max_steps,
-                 goal_name="ball",
+                 goal_name="pioneer2dx_noplugin_ghost",
                  distraction_list=[
                      'coke_can', 'table', 'car_wheel', 'plastic_cup', 'beer'
                  ],
@@ -666,13 +666,6 @@ def _move_goal(self, goal, agent_loc, agent_dir, move_distractions=True):
                     if obj:
                         distraction_loc = obj.get_pose()[0]
                         avoid_locations.append(distraction_loc)
-        loc, dist = self._move_obj(
-            obj=goal,
-            agent_loc=agent_loc,
-            agent_dir=agent_dir,
-            is_goal=True,
-            avoid_locations=avoid_locations,
-            name="goal")
         if self._speed_goal:
             MAX_SPEED = self._speed_goal_limit * 2
             xspeed = (0.5 - random.random()) * MAX_SPEED
@@ -685,6 +678,13 @@ def _move_goal(self, goal, agent_loc, agent_dir, move_distractions=True):
                 [xspeed, yspeed, 0, 0, 0, yawspeed, 0, 0, 0, yaw])
             if self._pose_goal:
                 self._aux_desired = np.array([yaw])
+        loc, dist = self._move_obj(
+            obj=goal,
+            agent_loc=agent_loc,
+            agent_dir=agent_dir,
+            is_goal=True,
+            avoid_locations=avoid_locations,
+            name="goal")
         self._goal_dist += dist
         avoid_locations.append(loc)
         distractions = OrderedDict()
@@ -776,7 +776,10 @@ def _move_obj(self,
         if is_goal:
             self._prev_dist = self._initial_dist
         obj.reset()
-        obj.set_pose((loc, (0, 0, 0)))
+        yaw = 0
+        if is_goal and self._speed_goal:
+            yaw = self._aux_desired[-1]
+        obj.set_pose((loc, (0, 0, yaw)))
         return loc, dist
 
     def _random_move_objects(self, random_range=10.0):

From 3a1700177972bb4a26743d91dba1fe1c45523ff2 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Fri, 7 Oct 2022 15:33:22 -0700
Subject: [PATCH 51/53] allow additionally including x-y speed and yaw
 (together with x-y position) into goal in GoalTask.

---
 python/social_bot/tasks.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 7f3570e4..7d29b0c1 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -181,6 +181,7 @@ def __init__(self,
                  speed_goal=False,
                  speed_goal_limit=1.6,
                  pose_goal=False,
+                 yaw_and_speed=False,
                  use_aux_achieved=False,
                  xy_only_aux=False,
                  multi_dim_reward=False,
@@ -225,6 +226,8 @@ def __init__(self,
             speed_goal_limit (float): randomly sample speed goal in the range: -limit to +limit.
             pose_goal (bool): When speed_goal is True, if pose_goal is True, speed and everything else is put
                 into observations, instead of aux_achieved, so only pose is in aux_achieved.
+            yaw_and_speed (bool): When speed_goal is True, and not pose_goal, if yaw_and_speed is True,
+                will include yaw and x-y speed into goal, nothing else.
             use_aux_achieved (bool): if True, pull out speed, pose dimensions into a separate
                 field: aux_achieved.  Only valid when goal_conditioned is True.
             xy_only_aux (bool): exclude irrelevant dimensions (z-axis movements) from
@@ -293,6 +296,7 @@ def __init__(self,
         self._speed_goal = speed_goal
         self._speed_goal_limit = speed_goal_limit
         self._pose_goal = pose_goal
+        self._yaw_and_speed = yaw_and_speed
         self._use_aux_achieved = use_aux_achieved
         self._xy_only_aux = xy_only_aux
         self._multi_dim_reward = multi_dim_reward
@@ -464,6 +468,8 @@ def _get_goal_dist(self, goal):
             aux = self._get_agent_aux_dims()
             if self._pose_goal:
                 aux = aux[-1:]  # yaw
+            elif self._yaw_and_speed:
+                aux = np.concatenate((aux[:2], aux[-1:]), axis=0)
             _loc = np.concatenate((_loc, aux), axis=0)
 
         dist = np.linalg.norm(_loc - _goal_loc)
@@ -678,6 +684,8 @@ def _move_goal(self, goal, agent_loc, agent_dir, move_distractions=True):
                 [xspeed, yspeed, 0, 0, 0, yawspeed, 0, 0, 0, yaw])
             if self._pose_goal:
                 self._aux_desired = np.array([yaw])
+            elif self._yaw_and_speed:
+                self._aux_desired = np.array([xspeed, yspeed, yaw])
         loc, dist = self._move_obj(
             obj=goal,
             agent_loc=agent_loc,
@@ -827,6 +835,10 @@ def generate_goal_conditioned_obs(self, agent):
                 obs['observation'] = np.concatenate(
                     (obs['observation'], aux[:-1]), axis=0)
                 aux = aux[-1:]  # yaw
+            elif self._yaw_and_speed:
+                obs['observation'] = np.concatenate(
+                    (obs['observation'], aux[2:-1]), axis=0)
+                aux = np.concatenate((aux[:2], aux[-1:]), axis=0)
             obs['achieved_goal'] = np.concatenate((obs['achieved_goal'], aux),
                                                   axis=0)
             obs['desired_goal'] = np.concatenate(

From 9f13d9816c80ee63220d4c1c797413be07f1a1b2 Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Tue, 11 Oct 2022 13:13:04 -0700
Subject: [PATCH 52/53] goal conditioned dense reward for goal task

---
 python/social_bot/tasks.py | 44 +++++++++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index 7d29b0c1..b8e40614 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -178,6 +178,7 @@ def __init__(self,
                      'coke_can', 'table', 'car_wheel', 'plastic_cup', 'beer'
                  ],
                  goal_conditioned=False,
+                 goal_reward=1.,
                  speed_goal=False,
                  speed_goal_limit=1.6,
                  pose_goal=False,
@@ -222,6 +223,7 @@ def __init__(self,
             goal_name (string): name of the goal in the world
             distraction_list (list of string): a list of model. the model shoud be in gazebo database
             goal_conditioned (bool): if True, each step has -1 reward, unless at goal state, which gives 0.
+            goal_reward (float): magnitude of the reward for reaching goal during dense reward setting.
             speed_goal (bool): if True, use speed pose etc. together with position as part of goal.
             speed_goal_limit (float): randomly sample speed goal in the range: -limit to +limit.
             pose_goal (bool): When speed_goal is True, if pose_goal is True, speed and everything else is put
@@ -293,6 +295,7 @@ def __init__(self,
             env=env, max_steps=max_steps, reward_weight=reward_weight)
         self._goal_name = goal_name
         self._goal_conditioned = goal_conditioned
+        self._goal_reward = goal_reward
         self._speed_goal = speed_goal
         self._speed_goal_limit = speed_goal_limit
         self._pose_goal = pose_goal
@@ -525,15 +528,21 @@ def _run_one_goal(self, goal, move_distractions=True):
             if dist < self._success_distance_thresh and self._within_angle(
                     dot):
                 # within 45 degrees of the agent direction
-                reward = 1.0 - distraction_penalty
+                reward = 0
                 self._push_reward_queue(max(reward, 0))
                 if self._goal_conditioned:
-                    reward -= 1.
+                    if not self._sparse_reward:
+                        dense_reward = (
+                            self._prev_dist - dist) / self._initial_dist
+                        reward = self._goal_reward + dense_reward
                     if self._multi_dim_reward:
-                        rewards = np.array([0, -distraction_penalty])
+                        rewards = np.array([reward, -distraction_penalty])
+                    reward -= distraction_penalty
                 else:
+                    reward = self._goal_reward - distraction_penalty
                     if self._multi_dim_reward:
-                        rewards = np.array([1, -distraction_penalty])
+                        rewards = np.array(
+                            [self._goal_reward, -distraction_penalty])
                 logging.debug("yielding reward: " + str(reward))
                 logging.debug("at location: %s, aux: %s", a_loc.astype('|S5'),
                               self._get_agent_aux_dims().astype('|S5'))
@@ -575,7 +584,7 @@ def _run_one_goal(self, goal, move_distractions=True):
                     reward = (self._prev_dist - dist) / self._initial_dist
                 if self._multi_dim_reward:
                     rewards = np.array([reward, -distraction_penalty])
-                reward = reward - distraction_penalty
+                reward -= distraction_penalty
                 done = False
                 if distraction_penalty > 0:
                     logging.debug("yielding reward: " + str(reward))
@@ -587,7 +596,7 @@ def _run_one_goal(self, goal, move_distractions=True):
                     sentence=self._goal_name,
                     done=done,
                     rewards=rewards)
-        reward = -1.0
+
         dist, dot, loc, agent_dir = self._get_goal_dist(goal)
         distraction_penalty, prev_min_dist_to_distraction = (
             self._get_distraction_penalty(loc, dot,
@@ -597,16 +606,31 @@ def _run_one_goal(self, goal, move_distractions=True):
         success = False
         if dist < self._success_distance_thresh and self._within_angle(dot):
             success = True
-            reward = 1.0 - distraction_penalty
+            reward = 0
             self._push_reward_queue(max(reward, 0))
             if self._goal_conditioned:
-                reward -= 1.
+                if not self._sparse_reward:
+                    dense_reward = (
+                        self._prev_dist - dist) / self._initial_dist
+                    reward = self._goal_reward + dense_reward
                 if self._multi_dim_reward:
-                    rewards = np.array([0, -distraction_penalty])
+                    rewards = np.array([reward, -distraction_penalty])
+                reward -= distraction_penalty
             else:
+                reward = self._goal_reward - distraction_penalty
                 if self._multi_dim_reward:
-                    rewards = np.array([1, -distraction_penalty])
+                    rewards = np.array(
+                        [self._goal_reward, -distraction_penalty])
         else:
+            if self._sparse_reward:
+                reward = 0
+                if self._goal_conditioned:
+                    reward = -1
+            else:
+                reward = (self._prev_dist - dist) / self._initial_dist
+            if self._multi_dim_reward:
+                rewards = np.array([reward, -distraction_penalty])
+            reward -= distraction_penalty
             self._push_reward_queue(0)
             logging.debug("took more than {} steps".format(
                 str(self._max_steps)))

From 87294068adb99e6d6622733b28c383f88d24d87b Mon Sep 17 00:00:00 2001
From: Le Horizon <le.zhao+git@horizon.ai>
Date: Wed, 12 Oct 2022 10:30:17 -0700
Subject: [PATCH 53/53] allow position and speed goal multiplier in GoalTask to
 be roughly at the same scale as yaw.  It effectively reduces the goal
 threshold, making the goals harder to achieve.

---
 python/social_bot/tasks.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/python/social_bot/tasks.py b/python/social_bot/tasks.py
index b8e40614..7f56c67c 100644
--- a/python/social_bot/tasks.py
+++ b/python/social_bot/tasks.py
@@ -183,6 +183,7 @@ def __init__(self,
                  speed_goal_limit=1.6,
                  pose_goal=False,
                  yaw_and_speed=False,
+                 pos_goal_multiplier=0,
                  use_aux_achieved=False,
                  xy_only_aux=False,
                  multi_dim_reward=False,
@@ -230,6 +231,8 @@ def __init__(self,
                 into observations, instead of aux_achieved, so only pose is in aux_achieved.
             yaw_and_speed (bool): When speed_goal is True, and not pose_goal, if yaw_and_speed is True,
                 will include yaw and x-y speed into goal, nothing else.
+            pos_goal_multiplier (float): multiplier to apply on x-y position dimension before computing goal distance.
+                This is to make sure the effect of the x-y is similar as other dimensions of the goal, especially yaw.
             use_aux_achieved (bool): if True, pull out speed, pose dimensions into a separate
                 field: aux_achieved.  Only valid when goal_conditioned is True.
             xy_only_aux (bool): exclude irrelevant dimensions (z-axis movements) from
@@ -300,6 +303,7 @@ def __init__(self,
         self._speed_goal_limit = speed_goal_limit
         self._pose_goal = pose_goal
         self._yaw_and_speed = yaw_and_speed
+        self._pos_goal_multiplier = pos_goal_multiplier
         self._use_aux_achieved = use_aux_achieved
         self._xy_only_aux = xy_only_aux
         self._multi_dim_reward = multi_dim_reward
@@ -467,8 +471,14 @@ def _get_goal_dist(self, goal):
         _goal_loc = goal_loc.copy()
         _loc = np.array(loc).copy()
         if self._speed_goal:
-            _goal_loc = np.concatenate((_goal_loc, self._aux_desired), axis=0)
-            aux = self._get_agent_aux_dims()
+            aux_g = self._aux_desired.copy()
+            aux = self._get_agent_aux_dims().copy()
+            if self._pos_goal_multiplier > 0:
+                _loc *= self._pos_goal_multiplier
+                _goal_loc *= self._pos_goal_multiplier
+                aux_g[:2] *= self._pos_goal_multiplier * 2
+                aux[:2] *= self._pos_goal_multiplier * 2
+            _goal_loc = np.concatenate((_goal_loc, aux_g), axis=0)
             if self._pose_goal:
                 aux = aux[-1:]  # yaw
             elif self._yaw_and_speed: