From edde2da1087bcb2b32a33faa7aca54745f7474ea Mon Sep 17 00:00:00 2001
From: zhuboli <55901904+zhuboli@users.noreply.github.com>
Date: Tue, 29 Sep 2020 15:16:14 -0700
Subject: [PATCH 1/9] Add draft Retrace return estimation to TDLoss and value_ops

---
 alf/algorithms/td_loss.py | 31 ++++++++++++++++++++++++++++---
 alf/utils/value_ops.py    | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py
index b72dc592d..6e97a9100 100644
--- a/alf/algorithms/td_loss.py
+++ b/alf/algorithms/td_loss.py
@@ -70,13 +70,16 @@ def __init__(self,
         self._lambda = td_lambda
         self._debug_summaries = debug_summaries
 
-    def forward(self, experience, value, target_value):
+    def forward(self, experience, value, target_value,train_info = None):
         """Cacluate the loss.
 
         The first dimension of all the tensors is time dimension and the second
         dimesion is the batch dimension.
 
         Args:
+            train_info (sac_info or sarsa_info): in order to calculate the importance ratio
+                from info.action_distribution. If no input of train info and lambda is not 
+                0 and 1,it will use multistep method instead of retrace
             experience (Experience): experience collected from ``unroll()`` or
                 a replay buffer. All tensors are time-major.
             value (torch.Tensor): the time-major tensor for the value at each time
@@ -99,7 +102,7 @@ def forward(self, experience, value, target_value):
                 values=target_value,
                 step_types=experience.step_type,
                 discounts=experience.discount * self._gamma)
-        else:
+        elif train_info == None:
             advantages = value_ops.generalized_advantage_estimation(
                 rewards=experience.reward,
                 values=target_value,
@@ -107,7 +110,29 @@ def forward(self, experience, value, target_value):
                 discounts=experience.discount * self._gamma,
                 td_lambda=self._lambda)
             returns = advantages + target_value[:-1]
-
+        else:
+            scope = alf.summary.scope(self.__class__.__name__)       
+            importance_ratio,importance_ratio_clipped = value_ops.action_importance_ratio(
+                action_distribution=train_info.action_distribution,
+                collect_action_distribution=experience.rollout_info.action_distribution,
+                action=experience.action,
+                clipping_mode='capping',
+                importance_ratio_clipping= 0.0,
+                log_prob_clipping= 0.0,
+                scope=scope,
+                check_numerics=False,
+                debug_summaries=True)
+            advantages = value_ops.generalized_advantage_estimation_retrace(
+                importance_ratio = importance_ratio_clipped,
+                rewards=experience.reward,
+                values= value,
+                target_value = target_value,
+                step_types=experience.step_type,
+                discounts=experience.discount * self._gamma,
+                time_major = True,
+                td_lambda=self._lambda)
+            returns = advantages + value[:-1]
+            returns = returns.detach()
         value = value[:-1]
 
         if self._debug_summaries and alf.summary.should_record_summaries():
diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py
index a6bf85a23..c41502a40 100644
--- a/alf/utils/value_ops.py
+++ b/alf/utils/value_ops.py
@@ -255,3 +255,36 @@ def generalized_advantage_estimation(rewards,
         advs = advs.transpose(0, 1)
 
     return advs.detach()
+####### add for the retrace method
+def generalized_advantage_estimation_retrace(importance_ratio, discounts, rewards, td_lambda, time_major, values, target_value,step_types):
+    ############## compare the importance_ratio with 1
+    #importance_ratio = torch.min(importance_ratio, torch.tensor(1.))
+    ##### why we need this time_major, just sample distuibution?
+    if not time_major:
+        discounts = discounts.transpose(0, 1)
+        rewards = rewards.transpose(0, 1)
+        values = values.transpose(0, 1)
+        step_types = step_types.transpose(0, 1)
+        importance_ratio = importance_ratio.transpose(0,1)
+        target_value = target_value.transpose(0,1)
+
+    assert values.shape[0] >= 2, ("The sequence length needs to be "
+                                  "at least 2. Got {s}".format(
+                                      s=values.shape[0]))
+
+    #### calcuate the loss not very clear for this function
+    advs = torch.zeros_like(values)
+    is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32)
+    delta = (rewards[1:] + discounts[1:] * target_value[1:] - values[:-1])
+
+           
+    weighted_discounts = discounts[1:] * td_lambda * importance_ratio
+    with torch.no_grad():
+        for t in reversed(range(rewards.shape[0] - 1)):
+            advs[t] = (1 - is_lasts[t]) * \
+                      (delta[t] + weighted_discounts[t] * advs[t + 1])
+        advs = advs[:-1]
+    if not time_major:
+        advs = advs.transpose(0, 1)
+
+    return advs.detach()
\ No newline at end of file

From 394a39aee6846ecb59c9b0805fe1347a00dfe179 Mon Sep 17 00:00:00 2001
From: zhuboli <55901904+zhuboli@users.noreply.github.com>
Date: Thu, 8 Oct 2020 15:31:29 -0700
Subject: [PATCH 2/9] Pass train_info to critic losses, add use_retrace flag and retrace test

---
 alf/algorithms/ddpg_algorithm.py  |   3 +-
 alf/algorithms/sac_algorithm.py   |   3 +-
 alf/algorithms/sarsa_algorithm.py |   2 +-
 alf/algorithms/td_loss.py         | 168 ++++++++++++++++++++++++++++++
 alf/utils/value_ops.py            |   5 +-
 alf/utils/value_ops_test.py       |  25 +++++
 6 files changed, 199 insertions(+), 7 deletions(-)

diff --git a/alf/algorithms/ddpg_algorithm.py b/alf/algorithms/ddpg_algorithm.py
index 6c5ca7284..98ed6a7cc 100644
--- a/alf/algorithms/ddpg_algorithm.py
+++ b/alf/algorithms/ddpg_algorithm.py
@@ -326,7 +326,8 @@ def calc_loss(self, experience, train_info: DdpgInfo):
             critic_losses[i] = self._critic_losses[i](
                 experience=experience,
                 value=train_info.critic.q_values[:, :, i, ...],
-                target_value=train_info.critic.target_q_values).loss
+                target_value=train_info.critic.target_q_values,
+                train_info = train_info).loss
 
         critic_loss = math_ops.add_n(critic_losses)
 
diff --git a/alf/algorithms/sac_algorithm.py b/alf/algorithms/sac_algorithm.py
index 65633d297..de44c4cf3 100644
--- a/alf/algorithms/sac_algorithm.py
+++ b/alf/algorithms/sac_algorithm.py
@@ -757,7 +757,8 @@ def _calc_critic_loss(self, experience, train_info: SacInfo):
             critic_losses.append(
                 l(experience=experience,
                   value=critic_info.critics[:, :, i, ...],
-                  target_value=critic_info.target_critic).loss)
+                  target_value=critic_info.target_critic,
+                  train_info = train_info).loss)
 
         critic_loss = math_ops.add_n(critic_losses)
 
diff --git a/alf/algorithms/sarsa_algorithm.py b/alf/algorithms/sarsa_algorithm.py
index 86d07a74f..7c22fcb1b 100644
--- a/alf/algorithms/sarsa_algorithm.py
+++ b/alf/algorithms/sarsa_algorithm.py
@@ -435,7 +435,7 @@ def calc_loss(self, experience, info: SarsaInfo):
             target_critic = tensor_utils.tensor_prepend_zero(
                 info.target_critics)
             loss_info = self._critic_losses[i](shifted_experience, critic,
-                                               target_critic)
+                                               target_critic,info)
             critic_losses.append(nest_map(lambda l: l[:-1], loss_info.loss))
 
         critic_loss = math_ops.add_n(critic_losses)
diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py
index 6e97a9100..3e3289f52 100644
--- a/alf/algorithms/td_loss.py
+++ b/alf/algorithms/td_loss.py
@@ -8,6 +8,16 @@
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, # Copyright (c) 2019 Horizon Robotics. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
@@ -23,6 +33,164 @@
 from alf.utils.summary_utils import safe_mean_hist_summary
 
 
+@gin.configurable
+class TDLoss(nn.Module):
+    def __init__(self,
+                 gamma=0.99,
+                 td_error_loss_fn=element_wise_squared_loss,
+                 td_lambda=0.95,
+                 use_retrace=0,
+                 debug_summaries=False,
+                 name="TDLoss"):
+        r"""Create a TDLoss object.
+
+        Let :math:`G_{t:T}` be the bootstaped return from t to T:
+            :math:`G_{t:T} = \sum_{i=t+1}^T \gamma^{t-i-1}R_i + \gamma^{T-t} V(s_T)`
+        If ``td_lambda`` = 1, the target for step t is :math:`G_{t:T}`.
+        If ``td_lambda`` = 0, the target for step t is :math:`G_{t:t+1}`
+        If 0 < ``td_lambda`` < 1, the target for step t is the :math:`\lambda`-return:
+            :math:`G_t^\lambda = (1 - \lambda) \sum_{i=t+1}^{T-1} \lambda^{i-t}G_{t:i} + \lambda^{T-t-1} G_{t:T}`
+        There is a simple relationship between :math:`\lambda`-return and
+        the generalized advantage estimation :math:`\hat{A}^{GAE}_t`:
+            :math:`G_t^\lambda = \hat{A}^{GAE}_t + V(s_t)`
+        where the generalized advantage estimation is defined as:
+            :math:`\hat{A}^{GAE}_t = \sum_{i=t}^{T-1}(\gamma\lambda)^{i-t}(R_{i+1} + \gamma V(s_{i+1}) - V(s_i))`
+        use_retrace = 0 means one step or multi_step loss, use_retrace = 1 means retrace loss
+        References:
+
+        Schulman et al. `High-Dimensional Continuous Control Using Generalized Advantage Estimation
+        <https://arxiv.org/abs/1506.02438>`_
+
+        Sutton et al. `Reinforcement Learning: An Introduction
+        <http://incompleteideas.net/book/the-book.html>`_, Chapter 12, 2018
+
+        Args:
+            gamma (float): A discount factor for future rewards.
+            td_errors_loss_fn (Callable): A function for computing the TD errors
+                loss. This function takes as input the target and the estimated
+                Q values and returns the loss for each element of the batch.
+            td_lambda (float): Lambda parameter for TD-lambda computation.
+            debug_summaries (bool): True if debug summaries should be created.
+            name (str): The name of this loss.
+        """
+        super().__init__()
+
+        self._name = name
+        self._gamma = gamma
+        self._td_error_loss_fn = td_error_loss_fn
+        self._lambda = td_lambda
+        self._debug_summaries = debug_summaries
+        self._use_retrace = use_retrace
+    def forward(self, experience, value, target_value, train_info):
+        """Cacluate the loss.
+
+        The first dimension of all the tensors is time dimension and the second
+        dimesion is the batch dimension.
+
+        Args:
+            experience (Experience): experience collected from ``unroll()`` or
+                a replay buffer. All tensors are time-major.
+            value (torch.Tensor): the time-major tensor for the value at each time
+                step. The loss is between this and the calculated return.
+            target_value (torch.Tensor): the time-major tensor for the value at
+                each time step. This is used to calculate return. ``target_value``
+                can be same as ``value``.
+            train_info (sarsa info, sac info): information used to calcuate importance_ratio
+                or importance_ratio_clipped
+        Returns:
+            LossInfo: with the ``extra`` field same as ``loss``.
+        """
+        if self._lambda == 1.0:
+            returns = value_ops.discounted_return(
+                rewards=experience.reward,
+                values=target_value,
+                step_types=experience.step_type,
+                discounts=experience.discount * self._gamma)
+        elif self._lambda == 0.0:
+            returns = value_ops.one_step_discounted_return(
+                rewards=experience.reward,
+                values=target_value,
+                step_types=experience.step_type,
+                discounts=experience.discount * self._gamma)
+        elif self._use_retrace == 0:
+            advantages = value_ops.generalized_advantage_estimation(
+                rewards=experience.reward,
+                values=target_value,
+                step_types=experience.step_type,
+                discounts=experience.discount * self._gamma,
+                td_lambda=self._lambda)
+            returns = advantages + target_value[:-1]
+        else:
+            scope = alf.summary.scope(self.__class__.__name__)       
+            importance_ratio,importance_ratio_clipped = value_ops.action_importance_ratio(
+                action_distribution=train_info.action_distribution,
+                collect_action_distribution=experience.rollout_info.action_distribution,
+                action=experience.action,
+                clipping_mode='capping',
+                importance_ratio_clipping= 0.0,
+                log_prob_clipping= 0.0,
+                scope=scope,
+                check_numerics=False,
+                debug_summaries=True)
+            advantages = value_ops.generalized_advantage_estimation_retrace(
+                importance_ratio = importance_ratio_clipped,
+                rewards=experience.reward,
+                values= value,
+                target_value = target_value,
+                step_types=experience.step_type,
+                discounts=experience.discount * self._gamma,
+                time_major = True,
+                td_lambda=self._lambda)
+            returns = advantages + value[:-1]
+            returns = returns.detach()
+        value = value[:-1]
+
+        if self._debug_summaries and alf.summary.should_record_summaries():
+            mask = experience.step_type[:-1] != StepType.LAST
+            with alf.summary.scope(self._name):
+
+                def _summarize(v, r, td, suffix):
+                    alf.summary.scalar(
+                        "explained_variance_of_return_by_value" + suffix,
+                        tensor_utils.explained_variance(v, r, mask))
+                    safe_mean_hist_summary('values' + suffix, v, mask)
+                    safe_mean_hist_summary('returns' + suffix, r, mask)
+                    safe_mean_hist_summary("td_error" + suffix, td, mask)
+
+                if value.ndim == 2:
+                    _summarize(value, returns, returns - value, '')
+                else:
+                    td = returns - value
+                    for i in range(value.shape[2]):
+                        suffix = '/' + str(i)
+                        _summarize(value[..., i], returns[..., i], td[..., i],
+                                   suffix)
+
+        loss = self._td_error_loss_fn(returns.detach(), value)
+
+        if loss.ndim == 3:
+            # Multidimensional reward. Average over the critic loss for all dimensions
+            loss = loss.mean(dim=2)
+
+        # The shape of the loss expected by Algorith.update_with_gradient is
+        # [T, B], so we need to augment it with additional zeros.
+        loss = tensor_utils.tensor_extend_zero(loss)
+        return LossInfo(loss=loss, extra=loss)
+either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gin
+import torch
+import torch.nn as nn
+
+import alf
+from alf.data_structures import LossInfo, StepType
+from alf.utils.losses import element_wise_squared_loss
+from alf.utils import tensor_utils, value_ops
+from alf.utils.summary_utils import safe_mean_hist_summary
+
+
 @gin.configurable
 class TDLoss(nn.Module):
     def __init__(self,
diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py
index c41502a40..a93d2cb4d 100644
--- a/alf/utils/value_ops.py
+++ b/alf/utils/value_ops.py
@@ -255,11 +255,10 @@ def generalized_advantage_estimation(rewards,
         advs = advs.transpose(0, 1)
 
     return advs.detach()
+
 ####### add for the retrace method
 def generalized_advantage_estimation_retrace(importance_ratio, discounts, rewards, td_lambda, time_major, values, target_value,step_types):
-    ############## compare the importance_ratio with 1
     #importance_ratio = torch.min(importance_ratio, torch.tensor(1.))
-    ##### why we need this time_major, just sample distuibution?
     if not time_major:
         discounts = discounts.transpose(0, 1)
         rewards = rewards.transpose(0, 1)
@@ -271,8 +270,6 @@ def generalized_advantage_estimation_retrace(importance_ratio, discounts, reward
     assert values.shape[0] >= 2, ("The sequence length needs to be "
                                   "at least 2. Got {s}".format(
                                       s=values.shape[0]))
-
-    #### calcuate the loss not very clear for this function
     advs = torch.zeros_like(values)
     is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32)
     delta = (rewards[1:] + discounts[1:] * target_value[1:] - values[:-1])
diff --git a/alf/utils/value_ops_test.py b/alf/utils/value_ops_test.py
index ebd526127..024c12bca 100644
--- a/alf/utils/value_ops_test.py
+++ b/alf/utils/value_ops_test.py
@@ -170,7 +170,32 @@ def test_generalized_advantage_estimation(self):
             discounts=discounts,
             td_lambda=td_lambda,
             expected=expected)
+        
+class GeneralizedAdvantage_retrace_Test(unittest.TestCase):
+    """Tests for alf.utils.value_ops
+    """
 
+    def test_generalized_advantage_estimation_retrace(self):
+        values = torch.tensor([[2.] * 4], dtype=torch.float32)
+        step_types = torch.tensor([[StepType.MID] * 4], dtype=torch.int64)
+        rewards = torch.tensor([[3.] * 4], dtype=torch.float32)
+        discounts = torch.tensor([[0.9] * 4], dtype=torch.float32)
+        td_lambda = 0.6/0.9
+        target_value = torch.tensor([[3.] * 4], dtype=torch.float32)
+        importance_ratio = torch.tensor([[0.8] * 3], dtype=torch.float32)
+        d = 3 * 0.9+ 3 - 2
+        expected = torch.tensor([[  (d * 0.6 * 0.8 ) *0.6 * 0.8+ 0.6 * 0.8 * d + d, d * 0.6 * 0.8 + d, d]],                    
+                                dtype=torch.float32)
+        np.testing.assert_array_almost_equal(
+            value_ops.generalized_advantage_estimation_retrace(
+                rewards=rewards,
+                values=values,
+                target_value = target_value,
+                step_types=step_types,
+                discounts=discounts,
+                td_lambda=td_lambda,
+                importance_ratio = importance_ratio,
+                time_major=False), expected)
 
 if __name__ == '__main__':
     unittest.main()

From 074b5daaac45887bab62c92a42366dde628a89dd Mon Sep 17 00:00:00 2001
From: zhuboli <55901904+zhuboli@users.noreply.github.com>
Date: Thu, 8 Oct 2020 15:33:45 -0700
Subject: [PATCH 3/9] Remove duplicated license header and TDLoss class from td_loss.py

---
 alf/algorithms/td_loss.py | 168 --------------------------------------
 1 file changed, 168 deletions(-)

diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py
index 3e3289f52..281a7ff1c 100644
--- a/alf/algorithms/td_loss.py
+++ b/alf/algorithms/td_loss.py
@@ -8,16 +8,6 @@
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, # Copyright (c) 2019 Horizon Robotics. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
@@ -168,164 +158,6 @@ def _summarize(v, r, td, suffix):
 
         loss = self._td_error_loss_fn(returns.detach(), value)
 
-        if loss.ndim == 3:
-            # Multidimensional reward. Average over the critic loss for all dimensions
-            loss = loss.mean(dim=2)
-
-        # The shape of the loss expected by Algorith.update_with_gradient is
-        # [T, B], so we need to augment it with additional zeros.
-        loss = tensor_utils.tensor_extend_zero(loss)
-        return LossInfo(loss=loss, extra=loss)
-either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import gin
-import torch
-import torch.nn as nn
-
-import alf
-from alf.data_structures import LossInfo, StepType
-from alf.utils.losses import element_wise_squared_loss
-from alf.utils import tensor_utils, value_ops
-from alf.utils.summary_utils import safe_mean_hist_summary
-
-
-@gin.configurable
-class TDLoss(nn.Module):
-    def __init__(self,
-                 gamma=0.99,
-                 td_error_loss_fn=element_wise_squared_loss,
-                 td_lambda=0.95,
-                 debug_summaries=False,
-                 name="TDLoss"):
-        r"""Create a TDLoss object.
-
-        Let :math:`G_{t:T}` be the bootstaped return from t to T:
-            :math:`G_{t:T} = \sum_{i=t+1}^T \gamma^{t-i-1}R_i + \gamma^{T-t} V(s_T)`
-        If ``td_lambda`` = 1, the target for step t is :math:`G_{t:T}`.
-        If ``td_lambda`` = 0, the target for step t is :math:`G_{t:t+1}`
-        If 0 < ``td_lambda`` < 1, the target for step t is the :math:`\lambda`-return:
-            :math:`G_t^\lambda = (1 - \lambda) \sum_{i=t+1}^{T-1} \lambda^{i-t}G_{t:i} + \lambda^{T-t-1} G_{t:T}`
-        There is a simple relationship between :math:`\lambda`-return and
-        the generalized advantage estimation :math:`\hat{A}^{GAE}_t`:
-            :math:`G_t^\lambda = \hat{A}^{GAE}_t + V(s_t)`
-        where the generalized advantage estimation is defined as:
-            :math:`\hat{A}^{GAE}_t = \sum_{i=t}^{T-1}(\gamma\lambda)^{i-t}(R_{i+1} + \gamma V(s_{i+1}) - V(s_i))`
-
-        References:
-
-        Schulman et al. `High-Dimensional Continuous Control Using Generalized Advantage Estimation
-        <https://arxiv.org/abs/1506.02438>`_
-
-        Sutton et al. `Reinforcement Learning: An Introduction
-        <http://incompleteideas.net/book/the-book.html>`_, Chapter 12, 2018
-
-        Args:
-            gamma (float): A discount factor for future rewards.
-            td_errors_loss_fn (Callable): A function for computing the TD errors
-                loss. This function takes as input the target and the estimated
-                Q values and returns the loss for each element of the batch.
-            td_lambda (float): Lambda parameter for TD-lambda computation.
-            debug_summaries (bool): True if debug summaries should be created.
-            name (str): The name of this loss.
-        """
-        super().__init__()
-
-        self._name = name
-        self._gamma = gamma
-        self._td_error_loss_fn = td_error_loss_fn
-        self._lambda = td_lambda
-        self._debug_summaries = debug_summaries
-
-    def forward(self, experience, value, target_value,train_info = None):
-        """Cacluate the loss.
-
-        The first dimension of all the tensors is time dimension and the second
-        dimesion is the batch dimension.
-
-        Args:
-            train_info (sac_info or sarsa_info): in order to calculate the importance ratio
-                from info.action_distribution. If no input of train info and lambda is not 
-                0 and 1,it will use multistep method instead of retrace
-            experience (Experience): experience collected from ``unroll()`` or
-                a replay buffer. All tensors are time-major.
-            value (torch.Tensor): the time-major tensor for the value at each time
-                step. The loss is between this and the calculated return.
-            target_value (torch.Tensor): the time-major tensor for the value at
-                each time step. This is used to calculate return. ``target_value``
-                can be same as ``value``.
-        Returns:
-            LossInfo: with the ``extra`` field same as ``loss``.
-        """
-        if self._lambda == 1.0:
-            returns = value_ops.discounted_return(
-                rewards=experience.reward,
-                values=target_value,
-                step_types=experience.step_type,
-                discounts=experience.discount * self._gamma)
-        elif self._lambda == 0.0:
-            returns = value_ops.one_step_discounted_return(
-                rewards=experience.reward,
-                values=target_value,
-                step_types=experience.step_type,
-                discounts=experience.discount * self._gamma)
-        elif train_info == None:
-            advantages = value_ops.generalized_advantage_estimation(
-                rewards=experience.reward,
-                values=target_value,
-                step_types=experience.step_type,
-                discounts=experience.discount * self._gamma,
-                td_lambda=self._lambda)
-            returns = advantages + target_value[:-1]
-        else:
-            scope = alf.summary.scope(self.__class__.__name__)       
-            importance_ratio,importance_ratio_clipped = value_ops.action_importance_ratio(
-                action_distribution=train_info.action_distribution,
-                collect_action_distribution=experience.rollout_info.action_distribution,
-                action=experience.action,
-                clipping_mode='capping',
-                importance_ratio_clipping= 0.0,
-                log_prob_clipping= 0.0,
-                scope=scope,
-                check_numerics=False,
-                debug_summaries=True)
-            advantages = value_ops.generalized_advantage_estimation_retrace(
-                importance_ratio = importance_ratio_clipped,
-                rewards=experience.reward,
-                values= value,
-                target_value = target_value,
-                step_types=experience.step_type,
-                discounts=experience.discount * self._gamma,
-                time_major = True,
-                td_lambda=self._lambda)
-            returns = advantages + value[:-1]
-            returns = returns.detach()
-        value = value[:-1]
-
-        if self._debug_summaries and alf.summary.should_record_summaries():
-            mask = experience.step_type[:-1] != StepType.LAST
-            with alf.summary.scope(self._name):
-
-                def _summarize(v, r, td, suffix):
-                    alf.summary.scalar(
-                        "explained_variance_of_return_by_value" + suffix,
-                        tensor_utils.explained_variance(v, r, mask))
-                    safe_mean_hist_summary('values' + suffix, v, mask)
-                    safe_mean_hist_summary('returns' + suffix, r, mask)
-                    safe_mean_hist_summary("td_error" + suffix, td, mask)
-
-                if value.ndim == 2:
-                    _summarize(value, returns, returns - value, '')
-                else:
-                    td = returns - value
-                    for i in range(value.shape[2]):
-                        suffix = '/' + str(i)
-                        _summarize(value[..., i], returns[..., i], td[..., i],
-                                   suffix)
-
-        loss = self._td_error_loss_fn(returns.detach(), value)
-
         if loss.ndim == 3:
             # Multidimensional reward. Average over the critic loss for all dimensions
             loss = loss.mean(dim=2)

From 07b5929809bf364c26f8f52a60c4da1796178446 Mon Sep 17 00:00:00 2001
From: zhuboli <55901904+zhuboli@users.noreply.github.com>
Date: Thu, 8 Oct 2020 15:34:55 -0700
Subject: [PATCH 4/9] Remove commented-out importance_ratio clamp from retrace estimator

---
 alf/utils/value_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py
index a93d2cb4d..2bf336ed2 100644
--- a/alf/utils/value_ops.py
+++ b/alf/utils/value_ops.py
@@ -258,7 +258,7 @@ def generalized_advantage_estimation(rewards,
 
 ####### add for the retrace method
 def generalized_advantage_estimation_retrace(importance_ratio, discounts, rewards, td_lambda, time_major, values, target_value,step_types):
-    #importance_ratio = torch.min(importance_ratio, torch.tensor(1.))
+    
     if not time_major:
         discounts = discounts.transpose(0, 1)
         rewards = rewards.transpose(0, 1)

From 1219ef1325ea66630d33e8670c7ceee4773c9c30 Mon Sep 17 00:00:00 2001
From: zhuboli <55901904+zhuboli@users.noreply.github.com>
Date: Mon, 19 Oct 2020 14:58:04 -0700
Subject: [PATCH 5/9] Fix merge conflicts: add normalize_target argument to TDLoss.__init__

---
 alf/algorithms/td_loss.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py
index 281a7ff1c..6c1827719 100644
--- a/alf/algorithms/td_loss.py
+++ b/alf/algorithms/td_loss.py
@@ -29,6 +29,7 @@ def __init__(self,
                  gamma=0.99,
                  td_error_loss_fn=element_wise_squared_loss,
                  td_lambda=0.95,
+                 normalize_target=False,
                  use_retrace=0,
                  debug_summaries=False,
                  name="TDLoss"):
@@ -70,6 +71,8 @@ def __init__(self,
         self._td_error_loss_fn = td_error_loss_fn
         self._lambda = td_lambda
         self._debug_summaries = debug_summaries
+        self._normalize_target = normalize_target
+        self._target_normalizer = None
         self._use_retrace = use_retrace
     def forward(self, experience, value, target_value, train_info):
         """Cacluate the loss.

From 027c817f9a1dbba9a4366edf47fe3452328699f7 Mon Sep 17 00:00:00 2001
From: zhuboli <55901904+zhuboli@users.noreply.github.com>
Date: Fri, 23 Oct 2020 17:25:44 -0700
Subject: [PATCH 6/9] fix retrace

---
 alf/algorithms/sarsa_algorithm.py |  2 +-
 alf/algorithms/td_loss.py         | 36 +++++++++++++++----------------
 alf/utils/value_ops.py            | 31 ++++++++++++++++++++------
 alf/utils/value_ops_test.py       | 18 ++++++++++------
 4 files changed, 54 insertions(+), 33 deletions(-)

diff --git a/alf/algorithms/sarsa_algorithm.py b/alf/algorithms/sarsa_algorithm.py
index 7c22fcb1b..c451b52df 100644
--- a/alf/algorithms/sarsa_algorithm.py
+++ b/alf/algorithms/sarsa_algorithm.py
@@ -435,7 +435,7 @@ def calc_loss(self, experience, info: SarsaInfo):
             target_critic = tensor_utils.tensor_prepend_zero(
                 info.target_critics)
             loss_info = self._critic_losses[i](shifted_experience, critic,
-                                               target_critic,info)
+                                               target_critic, info)
             critic_losses.append(nest_map(lambda l: l[:-1], loss_info.loss))
 
         critic_loss = math_ops.add_n(critic_losses)
diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py
index dc2588ad2..06354f75a 100644
--- a/alf/algorithms/td_loss.py
+++ b/alf/algorithms/td_loss.py
@@ -31,10 +31,7 @@ def __init__(self,
                  td_error_loss_fn=element_wise_squared_loss,
                  td_lambda=0.95,
                  normalize_target=False,
- some-feature-retrace
-                 use_retrace=0,
-
- pytorch
+                 use_retrace=False,
                  debug_summaries=False,
                  name="TDLoss"):
         r"""Create a TDLoss object.
@@ -51,6 +48,7 @@ def __init__(self,
         where the generalized advantage estimation is defined as:
             :math:`\hat{A}^{GAE}_t = \sum_{i=t}^{T-1}(\gamma\lambda)^{i-t}(R_{i+1} + \gamma V(s_{i+1}) - V(s_i))`
         use_retrace = 0 means one step or multi_step loss, use_retrace = 1 means retrace loss
+            :math:`\mathcal{R} Q(x, a):=Q(x, a)+\mathbb{E}_{\mu}\left[\sum_{t \geq 0} \gamma^{t}\left(\prod_{s=1}^{t} c_{s}\right)\left(r_{t}+\gamma \mathbb{E}_{\pi} Q\left(x_{t+1}, \cdot\right)-Q\left(x_{t}, a_{t}\right)\right)\right]`
         References:
 
         Schulman et al. `High-Dimensional Continuous Control Using Generalized Advantage Estimation
@@ -59,6 +57,9 @@ def __init__(self,
         Sutton et al. `Reinforcement Learning: An Introduction
         <http://incompleteideas.net/book/the-book.html>`_, Chapter 12, 2018
 
+        Remi Munos et al. `Safe and efficient off-policy reinforcement learning
+        <https://arxiv.org/pdf/1606.02647.pdf>`_
+
         Args:
             gamma (float): A discount factor for future rewards.
             td_errors_loss_fn (Callable): A function for computing the TD errors
@@ -80,13 +81,9 @@ def __init__(self,
         self._debug_summaries = debug_summaries
         self._normalize_target = normalize_target
         self._target_normalizer = None
- some-feature-retrace
         self._use_retrace = use_retrace
-    def forward(self, experience, value, target_value, train_info):
 
-
-    def forward(self, experience, value, target_value):
- pytorch
+    def forward(self, experience, value, target_value, train_info):
         """Cacluate the loss.
 
         The first dimension of all the tensors is time dimension and the second
@@ -117,7 +114,7 @@ def forward(self, experience, value, target_value):
                 values=target_value,
                 step_types=experience.step_type,
                 discounts=experience.discount * self._gamma)
-        elif self._use_retrace == 0:
+        elif self._use_retrace == False:
             advantages = value_ops.generalized_advantage_estimation(
                 rewards=experience.reward,
                 values=target_value,
@@ -126,25 +123,26 @@ def forward(self, experience, value, target_value):
                 td_lambda=self._lambda)
             returns = advantages + target_value[:-1]
         else:
-            scope = alf.summary.scope(self.__class__.__name__)       
-            importance_ratio,importance_ratio_clipped = value_ops.action_importance_ratio(
+            scope = alf.summary.scope(self.__class__.__name__)
+            importance_ratio, importance_ratio_clipped = value_ops.action_importance_ratio(
                 action_distribution=train_info.action_distribution,
-                collect_action_distribution=experience.rollout_info.action_distribution,
+                collect_action_distribution=experience.rollout_info.
+                action_distribution,
                 action=experience.action,
                 clipping_mode='capping',
-                importance_ratio_clipping= 0.0,
-                log_prob_clipping= 0.0,
+                importance_ratio_clipping=0.0,
+                log_prob_clipping=0.0,
                 scope=scope,
                 check_numerics=False,
                 debug_summaries=True)
             advantages = value_ops.generalized_advantage_estimation_retrace(
-                importance_ratio = importance_ratio_clipped,
+                importance_ratio=importance_ratio_clipped,
                 rewards=experience.reward,
-                values= value,
-                target_value = target_value,
+                values=value,
+                target_value=target_value,
                 step_types=experience.step_type,
                 discounts=experience.discount * self._gamma,
-                time_major = True,
+                time_major=True,
                 td_lambda=self._lambda)
             returns = advantages + value[:-1]
             returns = returns.detach()
diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py
index 2bf336ed2..75f379d1f 100644
--- a/alf/utils/value_ops.py
+++ b/alf/utils/value_ops.py
@@ -256,16 +256,36 @@ def generalized_advantage_estimation(rewards,
 
     return advs.detach()
 
+
 ####### add for the retrace method
-def generalized_advantage_estimation_retrace(importance_ratio, discounts, rewards, td_lambda, time_major, values, target_value,step_types):
-    
+def generalized_advantage_estimation_retrace(importance_ratio, discounts,
+                                             rewards, td_lambda, time_major,
+                                             values, target_value, step_types):
+    """
+    compute the generalized advantage estimation for retrace method. Main change is adding 
+    importance ratio
+
+    Args:
+        importance_ratio: shape is [T], scalar between [0,1]. representing importance ratio
+        rewards (Tensor): shape is [T, B] (or [T]) representing rewards.
+        values (Tensor): shape is [T,B] (or [T]) representing values.
+        step_types (Tensor): shape is [T,B] (or [T]) representing step types.
+        discounts (Tensor): shape is [T, B] (or [T]) representing discounts.
+        td_lambda (float): A scalar between [0, 1]. It's used for variance
+            reduction in temporal difference.
+        time_major (bool): Whether input tensors are time major.
+            False means input tensors have shape [B, T].
+    Returns:
+        A tensor with shape [T-1, B] representing advantages. Shape is [B, T-1]
+        when time_major is false.        
+    """
     if not time_major:
         discounts = discounts.transpose(0, 1)
         rewards = rewards.transpose(0, 1)
         values = values.transpose(0, 1)
         step_types = step_types.transpose(0, 1)
-        importance_ratio = importance_ratio.transpose(0,1)
-        target_value = target_value.transpose(0,1)
+        importance_ratio = importance_ratio.transpose(0, 1)
+        target_value = target_value.transpose(0, 1)
 
     assert values.shape[0] >= 2, ("The sequence length needs to be "
                                   "at least 2. Got {s}".format(
@@ -274,7 +294,6 @@ def generalized_advantage_estimation_retrace(importance_ratio, discounts, reward
     is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32)
     delta = (rewards[1:] + discounts[1:] * target_value[1:] - values[:-1])
 
-           
     weighted_discounts = discounts[1:] * td_lambda * importance_ratio
     with torch.no_grad():
         for t in reversed(range(rewards.shape[0] - 1)):
@@ -284,4 +303,4 @@ def generalized_advantage_estimation_retrace(importance_ratio, discounts, reward
     if not time_major:
         advs = advs.transpose(0, 1)
 
-    return advs.detach()
\ No newline at end of file
+    return advs.detach()
diff --git a/alf/utils/value_ops_test.py b/alf/utils/value_ops_test.py
index 024c12bca..106f0d58e 100644
--- a/alf/utils/value_ops_test.py
+++ b/alf/utils/value_ops_test.py
@@ -170,7 +170,8 @@ def test_generalized_advantage_estimation(self):
             discounts=discounts,
             td_lambda=td_lambda,
             expected=expected)
-        
+
+
 class GeneralizedAdvantage_retrace_Test(unittest.TestCase):
     """Tests for alf.utils.value_ops
     """
@@ -180,22 +181,25 @@ def test_generalized_advantage_estimation_retrace(self):
         step_types = torch.tensor([[StepType.MID] * 4], dtype=torch.int64)
         rewards = torch.tensor([[3.] * 4], dtype=torch.float32)
         discounts = torch.tensor([[0.9] * 4], dtype=torch.float32)
-        td_lambda = 0.6/0.9
+        td_lambda = 0.6 / 0.9
         target_value = torch.tensor([[3.] * 4], dtype=torch.float32)
         importance_ratio = torch.tensor([[0.8] * 3], dtype=torch.float32)
-        d = 3 * 0.9+ 3 - 2
-        expected = torch.tensor([[  (d * 0.6 * 0.8 ) *0.6 * 0.8+ 0.6 * 0.8 * d + d, d * 0.6 * 0.8 + d, d]],                    
-                                dtype=torch.float32)
+        d = 3 * 0.9 + 3 - 2
+        expected = torch.tensor(
+            [[(d * 0.6 * 0.8) * 0.6 * 0.8 + 0.6 * 0.8 * d + d,
+              d * 0.6 * 0.8 + d, d]],
+            dtype=torch.float32)
         np.testing.assert_array_almost_equal(
             value_ops.generalized_advantage_estimation_retrace(
                 rewards=rewards,
                 values=values,
-                target_value = target_value,
+                target_value=target_value,
                 step_types=step_types,
                 discounts=discounts,
                 td_lambda=td_lambda,
-                importance_ratio = importance_ratio,
+                importance_ratio=importance_ratio,
                 time_major=False), expected)
 
+
 if __name__ == '__main__':
     unittest.main()

From 23d59456ce365a6c8ff7fd3f45be29763ca712ab Mon Sep 17 00:00:00 2001
From: zhuboli <55901904+zhuboli@users.noreply.github.com>
Date: Tue, 27 Oct 2020 18:41:28 -0700
Subject: [PATCH 7/9] still need merge advantage function

---
 alf/algorithms/td_loss.py | 14 +++++++++-----
 alf/utils/value_ops.py    |  4 ++--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py
index 06354f75a..44b6379ef 100644
--- a/alf/algorithms/td_loss.py
+++ b/alf/algorithms/td_loss.py
@@ -47,7 +47,7 @@ def __init__(self,
             :math:`G_t^\lambda = \hat{A}^{GAE}_t + V(s_t)`
         where the generalized advantage estimation is defined as:
             :math:`\hat{A}^{GAE}_t = \sum_{i=t}^{T-1}(\gamma\lambda)^{i-t}(R_{i+1} + \gamma V(s_{i+1}) - V(s_i))`
-        use_retrace = 0 means one step or multi_step loss, use_retrace = 1 means retrace loss
+        ``use_retrace=False`` means the one-step or multi-step loss; ``use_retrace=True`` means the Retrace loss:
             :math:`\mathcal{R} Q(x, a):=Q(x, a)+\mathbb{E}_{\mu}\left[\sum_{t \geq 0} \gamma^{t}\left(\prod_{s=1}^{t} c_{s}\right)\left(r_{t}+\gamma \mathbb{E}_{\pi} Q\left(x_{t+1}, \cdot\right)-Q\left(x_{t}, a_{t}\right)\right)\right]`
         References:
 
@@ -97,8 +97,11 @@ def forward(self, experience, value, target_value, train_info):
             target_value (torch.Tensor): the time-major tensor for the value at
                 each time step. This is used to calculate return. ``target_value``
                 can be same as ``value``.
-            train_info (sarsa info, sac info): information used to calcuate importance_ratio
-                or importance_ratio_clipped
+            train_info : train_info includes action distrbution, actor, critic and
+                other information. Different algorithm may have different info inside.
+                For the retrace method, we can use SarsaInfo, SacInfo or DdpgInfo as train_info 
+                for Sac, Sarsa or Ddpg algorithm. Adding train_info to calculate importance_ratio
+                and importance_ratio_clipped.               
         Returns:
             LossInfo: with the ``extra`` field same as ``loss``.
         """
@@ -124,7 +127,8 @@ def forward(self, experience, value, target_value, train_info):
             returns = advantages + target_value[:-1]
         else:
             scope = alf.summary.scope(self.__class__.__name__)
-            importance_ratio, importance_ratio_clipped = value_ops.action_importance_ratio(
+            importance_ratio, importance_ratio_clipped = value_ops. \
+            action_importance_ratio(
                 action_distribution=train_info.action_distribution,
                 collect_action_distribution=experience.rollout_info.
                 action_distribution,
@@ -134,7 +138,7 @@ def forward(self, experience, value, target_value, train_info):
                 log_prob_clipping=0.0,
                 scope=scope,
                 check_numerics=False,
-                debug_summaries=True)
+                debug_summaries=self._debug_summaries)
             advantages = value_ops.generalized_advantage_estimation_retrace(
                 importance_ratio=importance_ratio_clipped,
                 rewards=experience.reward,
diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py
index 75f379d1f..66e685027 100644
--- a/alf/utils/value_ops.py
+++ b/alf/utils/value_ops.py
@@ -257,7 +257,7 @@ def generalized_advantage_estimation(rewards,
     return advs.detach()
 
 
-####### add for the retrace method
+# add for the retrace method
 def generalized_advantage_estimation_retrace(importance_ratio, discounts,
                                              rewards, td_lambda, time_major,
                                              values, target_value, step_types):
@@ -266,7 +266,7 @@ def generalized_advantage_estimation_retrace(importance_ratio, discounts,
     importance ratio
 
     Args:
-        importance_ratio: shape is [T], scalar between [0,1]. representing importance ratio
+        importance_ratio (Tensor): shape is [T-1, B] (or [T-1]), values in [0, 1]; the importance ratios.
         rewards (Tensor): shape is [T, B] (or [T]) representing rewards.
         values (Tensor): shape is [T,B] (or [T]) representing values.
         step_types (Tensor): shape is [T,B] (or [T]) representing step types.

From 5cafd48932f690bf16b1f9c17d55b33eb788b39f Mon Sep 17 00:00:00 2001
From: zhuboli <55901904+zhuboli@users.noreply.github.com>
Date: Thu, 26 Nov 2020 07:51:34 +0800
Subject: [PATCH 8/9] merge function and fix bug

---
 .my_venv/bin/activate               | 76 +++++++++++++++++++++++++++++
 .my_venv/bin/activate.csh           | 37 ++++++++++++++
 .my_venv/bin/activate.fish          | 75 ++++++++++++++++++++++++++++
 .my_venv/bin/easy_install           | 10 ++++
 .my_venv/bin/easy_install-3.7       | 10 ++++
 .my_venv/bin/pip                    | 10 ++++
 .my_venv/bin/pip3                   | 10 ++++
 .my_venv/bin/pip3.7                 | 10 ++++
 .my_venv/bin/python                 |  1 +
 .my_venv/bin/python3                |  1 +
 .my_venv/pyvenv.cfg                 |  3 ++
 alf/.vscode/launch.json             | 22 +++++++++
 alf/algorithms/actor_critic_loss.py |  5 +-
 alf/algorithms/ppo_algorithm.py     |  3 ++
 alf/algorithms/td_loss.py           | 22 ++++++++-
 alf/examples/carla.gin              |  4 +-
 alf/utils/value_ops.py              | 32 ++++++++----
 alf/utils/value_ops_test.py         | 28 +++++++++--
 setup.py                            |  2 +-
 19 files changed, 341 insertions(+), 20 deletions(-)
 create mode 100644 .my_venv/bin/activate
 create mode 100644 .my_venv/bin/activate.csh
 create mode 100644 .my_venv/bin/activate.fish
 create mode 100755 .my_venv/bin/easy_install
 create mode 100755 .my_venv/bin/easy_install-3.7
 create mode 100755 .my_venv/bin/pip
 create mode 100755 .my_venv/bin/pip3
 create mode 100755 .my_venv/bin/pip3.7
 create mode 120000 .my_venv/bin/python
 create mode 120000 .my_venv/bin/python3
 create mode 100644 .my_venv/pyvenv.cfg
 create mode 100644 alf/.vscode/launch.json

diff --git a/.my_venv/bin/activate b/.my_venv/bin/activate
new file mode 100644
index 000000000..1db2cc311
--- /dev/null
+++ b/.my_venv/bin/activate
@@ -0,0 +1,76 @@
+# This file must be used with "source bin/activate" *from bash*
+# you cannot run it directly
+
+deactivate () {
+    # reset old environment variables
+    if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
+        PATH="${_OLD_VIRTUAL_PATH:-}"
+        export PATH
+        unset _OLD_VIRTUAL_PATH
+    fi
+    if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
+        PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
+        export PYTHONHOME
+        unset _OLD_VIRTUAL_PYTHONHOME
+    fi
+
+    # This should detect bash and zsh, which have a hash command that must
+    # be called to get it to forget past commands.  Without forgetting
+    # past commands the $PATH changes we made may not be respected
+    if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
+        hash -r
+    fi
+
+    if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
+        PS1="${_OLD_VIRTUAL_PS1:-}"
+        export PS1
+        unset _OLD_VIRTUAL_PS1
+    fi
+
+    unset VIRTUAL_ENV
+    if [ ! "${1:-}" = "nondestructive" ] ; then
+    # Self destruct!
+        unset -f deactivate
+    fi
+}
+
+# unset irrelevant variables
+deactivate nondestructive
+
+VIRTUAL_ENV="/Users/lizhubo/workspace/alf/.my_venv"
+export VIRTUAL_ENV
+
+_OLD_VIRTUAL_PATH="$PATH"
+PATH="$VIRTUAL_ENV/bin:$PATH"
+export PATH
+
+# unset PYTHONHOME if set
+# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
+# could use `if (set -u; : $PYTHONHOME) ;` in bash
+if [ -n "${PYTHONHOME:-}" ] ; then
+    _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
+    unset PYTHONHOME
+fi
+
+if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
+    _OLD_VIRTUAL_PS1="${PS1:-}"
+    if [ "x(.my_venv) " != x ] ; then
+	PS1="(.my_venv) ${PS1:-}"
+    else
+    if [ "`basename \"$VIRTUAL_ENV\"`" = "__" ] ; then
+        # special case for Aspen magic directories
+        # see http://www.zetadev.com/software/aspen/
+        PS1="[`basename \`dirname \"$VIRTUAL_ENV\"\``] $PS1"
+    else
+        PS1="(`basename \"$VIRTUAL_ENV\"`)$PS1"
+    fi
+    fi
+    export PS1
+fi
+
+# This should detect bash and zsh, which have a hash command that must
+# be called to get it to forget past commands.  Without forgetting
+# past commands the $PATH changes we made may not be respected
+if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
+    hash -r
+fi
diff --git a/.my_venv/bin/activate.csh b/.my_venv/bin/activate.csh
new file mode 100644
index 000000000..ec285bc16
--- /dev/null
+++ b/.my_venv/bin/activate.csh
@@ -0,0 +1,37 @@
+# This file must be used with "source bin/activate.csh" *from csh*.
+# You cannot run it directly.
+# Created by Davide Di Blasi <davidedb@gmail.com>.
+# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
+
+alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; test "\!:*" != "nondestructive" && unalias deactivate'
+
+# Unset irrelevant variables.
+deactivate nondestructive
+
+setenv VIRTUAL_ENV "/Users/lizhubo/workspace/alf/.my_venv"
+
+set _OLD_VIRTUAL_PATH="$PATH"
+setenv PATH "$VIRTUAL_ENV/bin:$PATH"
+
+
+set _OLD_VIRTUAL_PROMPT="$prompt"
+
+if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
+    if (".my_venv" != "") then
+        set env_name = ".my_venv"
+    else
+        if (`basename "VIRTUAL_ENV"` == "__") then
+            # special case for Aspen magic directories
+            # see http://www.zetadev.com/software/aspen/
+            set env_name = `basename \`dirname "$VIRTUAL_ENV"\``
+        else
+            set env_name = `basename "$VIRTUAL_ENV"`
+        endif
+    endif
+    set prompt = "[$env_name] $prompt"
+    unset env_name
+endif
+
+alias pydoc python -m pydoc
+
+rehash
diff --git a/.my_venv/bin/activate.fish b/.my_venv/bin/activate.fish
new file mode 100644
index 000000000..54a7ffcfb
--- /dev/null
+++ b/.my_venv/bin/activate.fish
@@ -0,0 +1,75 @@
+# This file must be used with ". bin/activate.fish" *from fish* (http://fishshell.org)
+# you cannot run it directly
+
+function deactivate  -d "Exit virtualenv and return to normal shell environment"
+    # reset old environment variables
+    if test -n "$_OLD_VIRTUAL_PATH"
+        set -gx PATH $_OLD_VIRTUAL_PATH
+        set -e _OLD_VIRTUAL_PATH
+    end
+    if test -n "$_OLD_VIRTUAL_PYTHONHOME"
+        set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
+        set -e _OLD_VIRTUAL_PYTHONHOME
+    end
+
+    if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
+        functions -e fish_prompt
+        set -e _OLD_FISH_PROMPT_OVERRIDE
+        functions -c _old_fish_prompt fish_prompt
+        functions -e _old_fish_prompt
+    end
+
+    set -e VIRTUAL_ENV
+    if test "$argv[1]" != "nondestructive"
+        # Self destruct!
+        functions -e deactivate
+    end
+end
+
+# unset irrelevant variables
+deactivate nondestructive
+
+set -gx VIRTUAL_ENV "/Users/lizhubo/workspace/alf/.my_venv"
+
+set -gx _OLD_VIRTUAL_PATH $PATH
+set -gx PATH "$VIRTUAL_ENV/bin" $PATH
+
+# unset PYTHONHOME if set
+if set -q PYTHONHOME
+    set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
+    set -e PYTHONHOME
+end
+
+if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
+    # fish uses a function instead of an env var to generate the prompt.
+
+    # save the current fish_prompt function as the function _old_fish_prompt
+    functions -c fish_prompt _old_fish_prompt
+
+    # with the original prompt function renamed, we can override with our own.
+    function fish_prompt
+        # Save the return status of the last command
+        set -l old_status $status
+
+        # Prompt override?
+        if test -n "(.my_venv) "
+            printf "%s%s" "(.my_venv) " (set_color normal)
+        else
+            # ...Otherwise, prepend env
+            set -l _checkbase (basename "$VIRTUAL_ENV")
+            if test $_checkbase = "__"
+                # special case for Aspen magic directories
+                # see http://www.zetadev.com/software/aspen/
+                printf "%s[%s]%s " (set_color -b blue white) (basename (dirname "$VIRTUAL_ENV")) (set_color normal)
+            else
+                printf "%s(%s)%s" (set_color -b blue white) (basename "$VIRTUAL_ENV") (set_color normal)
+            end
+        end
+
+        # Restore the return status of the previous command.
+        echo "exit $old_status" | .
+        _old_fish_prompt
+    end
+
+    set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
+end
diff --git a/.my_venv/bin/easy_install b/.my_venv/bin/easy_install
new file mode 100755
index 000000000..f630a2891
--- /dev/null
+++ b/.my_venv/bin/easy_install
@@ -0,0 +1,10 @@
+#!/Users/lizhubo/workspace/alf/.my_venv/bin/python
+# -*- coding: utf-8 -*-
+import re
+import sys
+
+from setuptools.command.easy_install import main
+
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
+    sys.exit(main())
diff --git a/.my_venv/bin/easy_install-3.7 b/.my_venv/bin/easy_install-3.7
new file mode 100755
index 000000000..f630a2891
--- /dev/null
+++ b/.my_venv/bin/easy_install-3.7
@@ -0,0 +1,10 @@
+#!/Users/lizhubo/workspace/alf/.my_venv/bin/python
+# -*- coding: utf-8 -*-
+import re
+import sys
+
+from setuptools.command.easy_install import main
+
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
+    sys.exit(main())
diff --git a/.my_venv/bin/pip b/.my_venv/bin/pip
new file mode 100755
index 000000000..f51e24700
--- /dev/null
+++ b/.my_venv/bin/pip
@@ -0,0 +1,10 @@
+#!/Users/lizhubo/workspace/alf/.my_venv/bin/python
+# -*- coding: utf-8 -*-
+import re
+import sys
+
+from pip._internal import main
+
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
+    sys.exit(main())
diff --git a/.my_venv/bin/pip3 b/.my_venv/bin/pip3
new file mode 100755
index 000000000..f51e24700
--- /dev/null
+++ b/.my_venv/bin/pip3
@@ -0,0 +1,10 @@
+#!/Users/lizhubo/workspace/alf/.my_venv/bin/python
+# -*- coding: utf-8 -*-
+import re
+import sys
+
+from pip._internal import main
+
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
+    sys.exit(main())
diff --git a/.my_venv/bin/pip3.7 b/.my_venv/bin/pip3.7
new file mode 100755
index 000000000..f51e24700
--- /dev/null
+++ b/.my_venv/bin/pip3.7
@@ -0,0 +1,10 @@
+#!/Users/lizhubo/workspace/alf/.my_venv/bin/python
+# -*- coding: utf-8 -*-
+import re
+import sys
+
+from pip._internal import main
+
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
+    sys.exit(main())
diff --git a/.my_venv/bin/python b/.my_venv/bin/python
new file mode 120000
index 000000000..3381f8782
--- /dev/null
+++ b/.my_venv/bin/python
@@ -0,0 +1 @@
+/opt/anaconda3/bin/python
\ No newline at end of file
diff --git a/.my_venv/bin/python3 b/.my_venv/bin/python3
new file mode 120000
index 000000000..d8654aa0e
--- /dev/null
+++ b/.my_venv/bin/python3
@@ -0,0 +1 @@
+python
\ No newline at end of file
diff --git a/.my_venv/pyvenv.cfg b/.my_venv/pyvenv.cfg
new file mode 100644
index 000000000..caaaab722
--- /dev/null
+++ b/.my_venv/pyvenv.cfg
@@ -0,0 +1,3 @@
+home = /opt/anaconda3/bin
+include-system-site-packages = false
+version = 3.7.6
diff --git a/alf/.vscode/launch.json b/alf/.vscode/launch.json
new file mode 100644
index 000000000..0024ede62
--- /dev/null
+++ b/alf/.vscode/launch.json
@@ -0,0 +1,22 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "train",
+            "type": "python",
+            "request": "launch",
+            "cwd": "/Users/lizhubo/workspace/alf/alf/examples",
+            "program": "/Users/lizhubo/workspace/alf/alf/bin/train.py",
+            "env": {"DISPLAY": ":8",
+                    "CUDA_VISIBLE_DEVICES": "0"},
+            "args": [
+                "--root_dir=/Users/lizhubo/Desktop/Pytorch/Results50",
+                "--gin_file=/Users/lizhubo/workspace/alf/alf/examples/sarsa_sac_pendulum.gin",
+            ],
+           
+        }
+    ]
+}
diff --git a/alf/algorithms/actor_critic_loss.py b/alf/algorithms/actor_critic_loss.py
index 1a28727b2..2092dfe51 100644
--- a/alf/algorithms/actor_critic_loss.py
+++ b/alf/algorithms/actor_critic_loss.py
@@ -169,7 +169,10 @@ def _calc_returns_and_advantages(self, experience, value):
                 values=value,
                 step_types=experience.step_type,
                 discounts=experience.discount * self._gamma,
-                td_lambda=self._lambda)
+                target_value=value,
+                td_lambda=self._lambda,
+                importance_ratio=1.0,
+                use_retrace=False)
             advantages = tensor_utils.tensor_extend_zero(advantages)
             if self._use_td_lambda_return:
                 returns = advantages + value
diff --git a/alf/algorithms/ppo_algorithm.py b/alf/algorithms/ppo_algorithm.py
index 72858d7b3..abdb59954 100644
--- a/alf/algorithms/ppo_algorithm.py
+++ b/alf/algorithms/ppo_algorithm.py
@@ -46,7 +46,10 @@ def preprocess_experience(self, exp: Experience):
             values=exp.rollout_info.value,
             step_types=exp.step_type,
             discounts=exp.discount * self._loss._gamma,
+            target_value=exp.rollout_info.value,
             td_lambda=self._loss._lambda,
+            importance_ratio=1.0,
+            use_retrace=False,
             time_major=False)
         advantages = torch.cat([
             advantages,
diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py
index 44b6379ef..49b96cf06 100644
--- a/alf/algorithms/td_loss.py
+++ b/alf/algorithms/td_loss.py
@@ -99,7 +99,7 @@ def forward(self, experience, value, target_value, train_info):
                 can be same as ``value``.
             train_info : train_info includes action distrbution, actor, critic and
                 other information. Different algorithm may have different info inside.
-                For the retrace method, we can use SarsaInfo, SacInfo or DdpgInfo as train_info 
+                For the retrace method, we can use SarsaInfo, SacInfo or DdpgInfo as train_info
                 for Sac, Sarsa or Ddpg algorithm. Adding train_info to calculate importance_ratio
                 and importance_ratio_clipped.               
         Returns:
@@ -118,10 +118,26 @@ def forward(self, experience, value, target_value, train_info):
                 step_types=experience.step_type,
                 discounts=experience.discount * self._gamma)
         elif self._use_retrace == False:
+            scope = alf.summary.scope(self.__class__.__name__)
+            importance_ratio, importance_ratio_clipped = value_ops. \
+            action_importance_ratio(
+                action_distribution=train_info.action_distribution,
+                collect_action_distribution=experience.rollout_info.
+                action_distribution,
+                action=experience.action,
+                clipping_mode='capping',
+                importance_ratio_clipping=0.0,
+                log_prob_clipping=0.0,
+                scope=scope,
+                check_numerics=False,
+                debug_summaries=self._debug_summaries)
             advantages = value_ops.generalized_advantage_estimation(
                 rewards=experience.reward,
                 values=target_value,
                 step_types=experience.step_type,
+                target_value=target_value,
+                importance_ratio=importance_ratio,
+                use_retrace=False,
                 discounts=experience.discount * self._gamma,
                 td_lambda=self._lambda)
             returns = advantages + target_value[:-1]
@@ -139,15 +155,17 @@ def forward(self, experience, value, target_value, train_info):
                 scope=scope,
                 check_numerics=False,
                 debug_summaries=self._debug_summaries)
-            advantages = value_ops.generalized_advantage_estimation_retrace(
+            advantages = value_ops.generalized_advantage_estimation(
                 importance_ratio=importance_ratio_clipped,
                 rewards=experience.reward,
                 values=value,
                 target_value=target_value,
                 step_types=experience.step_type,
                 discounts=experience.discount * self._gamma,
+                use_retrace=True,
                 time_major=True,
                 td_lambda=self._lambda)
+
             returns = advantages + value[:-1]
             returns = returns.detach()
         value = value[:-1]
diff --git a/alf/examples/carla.gin b/alf/examples/carla.gin
index 39a1d1f66..e42a2c8b7 100644
--- a/alf/examples/carla.gin
+++ b/alf/examples/carla.gin
@@ -4,8 +4,8 @@ import alf
 import alf.algorithms.merlin_algorithm
 import alf.environments.suite_carla
 
-CameraSensor.image_size_x=200
-CameraSensor.image_size_y=100
+CameraSensor.image_size_x=128
+CameraSensor.image_size_y=64
 CameraSensor.fov=135
 
 create_environment.env_name='Town01'
diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py
index 66e685027..583c14954 100644
--- a/alf/utils/value_ops.py
+++ b/alf/utils/value_ops.py
@@ -195,6 +195,9 @@ def generalized_advantage_estimation(rewards,
                                      values,
                                      step_types,
                                      discounts,
+                                     target_value,
+                                     importance_ratio,
+                                     use_retrace=False,
                                      td_lambda=1.0,
                                      time_major=True):
     """Computes generalized advantage estimation (GAE) for the first T-1 steps.
@@ -231,6 +234,8 @@ def generalized_advantage_estimation(rewards,
         rewards = rewards.transpose(0, 1)
         values = values.transpose(0, 1)
         step_types = step_types.transpose(0, 1)
+        importance_ratio = importance_ratio.transpose(0, 1)
+        target_value = target_value.transpose(0, 1)
 
     assert values.shape[0] >= 2, ("The sequence length needs to be "
                                   "at least 2. Got {s}".format(
@@ -240,16 +245,23 @@ def generalized_advantage_estimation(rewards,
     is_lasts = common.expand_dims_as(is_lasts, values)
     discounts = common.expand_dims_as(discounts, values)
 
-    weighted_discounts = discounts[1:] * td_lambda
-
     advs = torch.zeros_like(values)
-    delta = rewards[1:] + discounts[1:] * values[1:] - values[:-1]
-
-    with torch.no_grad():
-        for t in reversed(range(rewards.shape[0] - 1)):
-            advs[t] = (1 - is_lasts[t]) * \
-                      (delta[t] + weighted_discounts[t] * advs[t + 1])
-        advs = advs[:-1]
+    if use_retrace == False:
+        weighted_discounts = discounts[1:] * td_lambda
+        delta = rewards[1:] + discounts[1:] * values[1:] - values[:-1]
+        with torch.no_grad():
+            for t in reversed(range(rewards.shape[0] - 1)):
+                advs[t] = (1 - is_lasts[t]) * \
+                        (delta[t] + weighted_discounts[t] * advs[t + 1])
+            advs = advs[:-1]
+    else:
+        delta = (rewards[1:] + discounts[1:] * target_value[1:] - values[:-1])
+        weighted_discounts = discounts[1:] * td_lambda * importance_ratio
+        with torch.no_grad():
+            for t in reversed(range(rewards.shape[0] - 1)):
+                advs[t] = (1 - is_lasts[t]) * \
+                        (delta[t] + weighted_discounts[t] * advs[t + 1])
+            advs = advs[:-1]
 
     if not time_major:
         advs = advs.transpose(0, 1)
@@ -257,6 +269,7 @@ def generalized_advantage_estimation(rewards,
     return advs.detach()
 
 
+'''
 # add for the retrace method
 def generalized_advantage_estimation_retrace(importance_ratio, discounts,
                                              rewards, td_lambda, time_major,
@@ -304,3 +317,4 @@ def generalized_advantage_estimation_retrace(importance_ratio, discounts,
         advs = advs.transpose(0, 1)
 
     return advs.detach()
+'''
diff --git a/alf/utils/value_ops_test.py b/alf/utils/value_ops_test.py
index 106f0d58e..cd5d19f39 100644
--- a/alf/utils/value_ops_test.py
+++ b/alf/utils/value_ops_test.py
@@ -96,14 +96,17 @@ class GeneralizedAdvantageTest(unittest.TestCase):
     """Tests for alf.utils.value_ops.generalized_advantage_estimation
     """
 
-    def _check(self, rewards, values, step_types, discounts, td_lambda,
-               expected):
+    def _check(self, rewards, values, step_types, discounts, target_value,
+               importance_ratio, use_retrace, td_lambda, expected):
         np.testing.assert_array_almost_equal(
             value_ops.generalized_advantage_estimation(
                 rewards=rewards,
                 values=values,
                 step_types=step_types,
                 discounts=discounts,
+                target_value=target_value,
+                importance_ratio=importance_ratio,
+                use_retrace=use_retrace,
                 td_lambda=td_lambda,
                 time_major=False), expected)
 
@@ -113,6 +116,9 @@ def _check(self, rewards, values, step_types, discounts, td_lambda,
                 values=torch.stack([values, 2 * values], dim=2),
                 step_types=step_types,
                 discounts=discounts,
+                importance_ratio=importance_ratio,
+                target_value=target_value,
+                use_retrace=use_retrace,
                 td_lambda=td_lambda,
                 time_major=False),
             torch.stack([expected, 2 * expected], dim=2),
@@ -124,7 +130,9 @@ def test_generalized_advantage_estimation(self):
         rewards = torch.tensor([[3.] * 5], dtype=torch.float32)
         discounts = torch.tensor([[0.9] * 5], dtype=torch.float32)
         td_lambda = 0.6 / 0.9
-
+        target_value = torch.tensor([[3.] * 4], dtype=torch.float32)
+        importance_ratio = torch.tensor([[0.8] * 3], dtype=torch.float32)
+        use_retrace = False
         d = 2 * 0.9 + 1
         expected = torch.tensor([[((d * 0.6 + d) * 0.6 + d) * 0.6 + d,
                                   (d * 0.6 + d) * 0.6 + d, d * 0.6 + d, d]],
@@ -134,7 +142,10 @@ def test_generalized_advantage_estimation(self):
             values=values,
             step_types=step_types,
             discounts=discounts,
+            importance_ratio=importance_ratio,
+            target_value=target_value,
             td_lambda=td_lambda,
+            use_retrace=use_retrace,
             expected=expected)
 
         # two episodes, and exceed by time limit (discount=1)
@@ -150,7 +161,10 @@ def test_generalized_advantage_estimation(self):
             values=values,
             step_types=step_types,
             discounts=discounts,
+            importance_ratio=importance_ratio,
+            target_value=target_value,
             td_lambda=td_lambda,
+            use_retrace=use_retrace,
             expected=expected)
 
         # tow episodes, and end normal (discount=0)
@@ -169,12 +183,16 @@ def test_generalized_advantage_estimation(self):
             step_types=step_types,
             discounts=discounts,
             td_lambda=td_lambda,
+            importance_ratio=importance_ratio,
+            target_value=target_value,
+            use_retrace=use_retrace,
             expected=expected)
 
 
+'''
 class GeneralizedAdvantage_retrace_Test(unittest.TestCase):
     """Tests for alf.utils.value_ops
-    """
+    """GeneralizedAdvantageTest.test_generalized_advantage_estimation()
 
     def test_generalized_advantage_estimation_retrace(self):
         values = torch.tensor([[2.] * 4], dtype=torch.float32)
@@ -199,7 +217,7 @@ def test_generalized_advantage_estimation_retrace(self):
                 td_lambda=td_lambda,
                 importance_ratio=importance_ratio,
                 time_major=False), expected)
-
+'''
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/setup.py b/setup.py
index 579b90dbb..a8f3466ea 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
     install_requires=[
         'atari_py == 0.1.7',
         'cpplint',
-        'clang-format == 9.0',
+        #'clang-format == 9.0',
         'fasteners',
         'gin-config@git+https://github.com/HorizonRobotics/gin-config.git',
         'gym == 0.12.5',

From 24661264ab83fbd725d4c35962a55c72418e3536 Mon Sep 17 00:00:00 2001
From: zhuboli <55901904+zhuboli@users.noreply.github.com>
Date: Thu, 26 Nov 2020 08:05:28 +0800
Subject: [PATCH 9/9] merge function and fix bug

---
 .my_venv/bin/activate         | 76 -----------------------------------
 .my_venv/bin/activate.csh     | 37 -----------------
 .my_venv/bin/activate.fish    | 75 ----------------------------------
 .my_venv/bin/easy_install     | 10 -----
 .my_venv/bin/easy_install-3.7 | 10 -----
 .my_venv/bin/pip              | 10 -----
 .my_venv/bin/pip3             | 10 -----
 .my_venv/bin/pip3.7           | 10 -----
 .my_venv/bin/python           |  1 -
 .my_venv/bin/python3          |  1 -
 .my_venv/pyvenv.cfg           |  3 --
 alf/.vscode/launch.json       | 22 ----------
 12 files changed, 265 deletions(-)
 delete mode 100644 .my_venv/bin/activate
 delete mode 100644 .my_venv/bin/activate.csh
 delete mode 100644 .my_venv/bin/activate.fish
 delete mode 100755 .my_venv/bin/easy_install
 delete mode 100755 .my_venv/bin/easy_install-3.7
 delete mode 100755 .my_venv/bin/pip
 delete mode 100755 .my_venv/bin/pip3
 delete mode 100755 .my_venv/bin/pip3.7
 delete mode 120000 .my_venv/bin/python
 delete mode 120000 .my_venv/bin/python3
 delete mode 100644 .my_venv/pyvenv.cfg
 delete mode 100644 alf/.vscode/launch.json

diff --git a/.my_venv/bin/activate b/.my_venv/bin/activate
deleted file mode 100644
index 1db2cc311..000000000
--- a/.my_venv/bin/activate
+++ /dev/null
@@ -1,76 +0,0 @@
-# This file must be used with "source bin/activate" *from bash*
-# you cannot run it directly
-
-deactivate () {
-    # reset old environment variables
-    if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
-        PATH="${_OLD_VIRTUAL_PATH:-}"
-        export PATH
-        unset _OLD_VIRTUAL_PATH
-    fi
-    if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
-        PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
-        export PYTHONHOME
-        unset _OLD_VIRTUAL_PYTHONHOME
-    fi
-
-    # This should detect bash and zsh, which have a hash command that must
-    # be called to get it to forget past commands.  Without forgetting
-    # past commands the $PATH changes we made may not be respected
-    if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
-        hash -r
-    fi
-
-    if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
-        PS1="${_OLD_VIRTUAL_PS1:-}"
-        export PS1
-        unset _OLD_VIRTUAL_PS1
-    fi
-
-    unset VIRTUAL_ENV
-    if [ ! "${1:-}" = "nondestructive" ] ; then
-    # Self destruct!
-        unset -f deactivate
-    fi
-}
-
-# unset irrelevant variables
-deactivate nondestructive
-
-VIRTUAL_ENV="/Users/lizhubo/workspace/alf/.my_venv"
-export VIRTUAL_ENV
-
-_OLD_VIRTUAL_PATH="$PATH"
-PATH="$VIRTUAL_ENV/bin:$PATH"
-export PATH
-
-# unset PYTHONHOME if set
-# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
-# could use `if (set -u; : $PYTHONHOME) ;` in bash
-if [ -n "${PYTHONHOME:-}" ] ; then
-    _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
-    unset PYTHONHOME
-fi
-
-if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
-    _OLD_VIRTUAL_PS1="${PS1:-}"
-    if [ "x(.my_venv) " != x ] ; then
-	PS1="(.my_venv) ${PS1:-}"
-    else
-    if [ "`basename \"$VIRTUAL_ENV\"`" = "__" ] ; then
-        # special case for Aspen magic directories
-        # see http://www.zetadev.com/software/aspen/
-        PS1="[`basename \`dirname \"$VIRTUAL_ENV\"\``] $PS1"
-    else
-        PS1="(`basename \"$VIRTUAL_ENV\"`)$PS1"
-    fi
-    fi
-    export PS1
-fi
-
-# This should detect bash and zsh, which have a hash command that must
-# be called to get it to forget past commands.  Without forgetting
-# past commands the $PATH changes we made may not be respected
-if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
-    hash -r
-fi
diff --git a/.my_venv/bin/activate.csh b/.my_venv/bin/activate.csh
deleted file mode 100644
index ec285bc16..000000000
--- a/.my_venv/bin/activate.csh
+++ /dev/null
@@ -1,37 +0,0 @@
-# This file must be used with "source bin/activate.csh" *from csh*.
-# You cannot run it directly.
-# Created by Davide Di Blasi <davidedb@gmail.com>.
-# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
-
-alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; test "\!:*" != "nondestructive" && unalias deactivate'
-
-# Unset irrelevant variables.
-deactivate nondestructive
-
-setenv VIRTUAL_ENV "/Users/lizhubo/workspace/alf/.my_venv"
-
-set _OLD_VIRTUAL_PATH="$PATH"
-setenv PATH "$VIRTUAL_ENV/bin:$PATH"
-
-
-set _OLD_VIRTUAL_PROMPT="$prompt"
-
-if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
-    if (".my_venv" != "") then
-        set env_name = ".my_venv"
-    else
-        if (`basename "VIRTUAL_ENV"` == "__") then
-            # special case for Aspen magic directories
-            # see http://www.zetadev.com/software/aspen/
-            set env_name = `basename \`dirname "$VIRTUAL_ENV"\``
-        else
-            set env_name = `basename "$VIRTUAL_ENV"`
-        endif
-    endif
-    set prompt = "[$env_name] $prompt"
-    unset env_name
-endif
-
-alias pydoc python -m pydoc
-
-rehash
diff --git a/.my_venv/bin/activate.fish b/.my_venv/bin/activate.fish
deleted file mode 100644
index 54a7ffcfb..000000000
--- a/.my_venv/bin/activate.fish
+++ /dev/null
@@ -1,75 +0,0 @@
-# This file must be used with ". bin/activate.fish" *from fish* (http://fishshell.org)
-# you cannot run it directly
-
-function deactivate  -d "Exit virtualenv and return to normal shell environment"
-    # reset old environment variables
-    if test -n "$_OLD_VIRTUAL_PATH"
-        set -gx PATH $_OLD_VIRTUAL_PATH
-        set -e _OLD_VIRTUAL_PATH
-    end
-    if test -n "$_OLD_VIRTUAL_PYTHONHOME"
-        set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
-        set -e _OLD_VIRTUAL_PYTHONHOME
-    end
-
-    if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
-        functions -e fish_prompt
-        set -e _OLD_FISH_PROMPT_OVERRIDE
-        functions -c _old_fish_prompt fish_prompt
-        functions -e _old_fish_prompt
-    end
-
-    set -e VIRTUAL_ENV
-    if test "$argv[1]" != "nondestructive"
-        # Self destruct!
-        functions -e deactivate
-    end
-end
-
-# unset irrelevant variables
-deactivate nondestructive
-
-set -gx VIRTUAL_ENV "/Users/lizhubo/workspace/alf/.my_venv"
-
-set -gx _OLD_VIRTUAL_PATH $PATH
-set -gx PATH "$VIRTUAL_ENV/bin" $PATH
-
-# unset PYTHONHOME if set
-if set -q PYTHONHOME
-    set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
-    set -e PYTHONHOME
-end
-
-if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
-    # fish uses a function instead of an env var to generate the prompt.
-
-    # save the current fish_prompt function as the function _old_fish_prompt
-    functions -c fish_prompt _old_fish_prompt
-
-    # with the original prompt function renamed, we can override with our own.
-    function fish_prompt
-        # Save the return status of the last command
-        set -l old_status $status
-
-        # Prompt override?
-        if test -n "(.my_venv) "
-            printf "%s%s" "(.my_venv) " (set_color normal)
-        else
-            # ...Otherwise, prepend env
-            set -l _checkbase (basename "$VIRTUAL_ENV")
-            if test $_checkbase = "__"
-                # special case for Aspen magic directories
-                # see http://www.zetadev.com/software/aspen/
-                printf "%s[%s]%s " (set_color -b blue white) (basename (dirname "$VIRTUAL_ENV")) (set_color normal)
-            else
-                printf "%s(%s)%s" (set_color -b blue white) (basename "$VIRTUAL_ENV") (set_color normal)
-            end
-        end
-
-        # Restore the return status of the previous command.
-        echo "exit $old_status" | .
-        _old_fish_prompt
-    end
-
-    set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
-end
diff --git a/.my_venv/bin/easy_install b/.my_venv/bin/easy_install
deleted file mode 100755
index f630a2891..000000000
--- a/.my_venv/bin/easy_install
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/Users/lizhubo/workspace/alf/.my_venv/bin/python
-# -*- coding: utf-8 -*-
-import re
-import sys
-
-from setuptools.command.easy_install import main
-
-if __name__ == '__main__':
-    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
-    sys.exit(main())
diff --git a/.my_venv/bin/easy_install-3.7 b/.my_venv/bin/easy_install-3.7
deleted file mode 100755
index f630a2891..000000000
--- a/.my_venv/bin/easy_install-3.7
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/Users/lizhubo/workspace/alf/.my_venv/bin/python
-# -*- coding: utf-8 -*-
-import re
-import sys
-
-from setuptools.command.easy_install import main
-
-if __name__ == '__main__':
-    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
-    sys.exit(main())
diff --git a/.my_venv/bin/pip b/.my_venv/bin/pip
deleted file mode 100755
index f51e24700..000000000
--- a/.my_venv/bin/pip
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/Users/lizhubo/workspace/alf/.my_venv/bin/python
-# -*- coding: utf-8 -*-
-import re
-import sys
-
-from pip._internal import main
-
-if __name__ == '__main__':
-    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
-    sys.exit(main())
diff --git a/.my_venv/bin/pip3 b/.my_venv/bin/pip3
deleted file mode 100755
index f51e24700..000000000
--- a/.my_venv/bin/pip3
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/Users/lizhubo/workspace/alf/.my_venv/bin/python
-# -*- coding: utf-8 -*-
-import re
-import sys
-
-from pip._internal import main
-
-if __name__ == '__main__':
-    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
-    sys.exit(main())
diff --git a/.my_venv/bin/pip3.7 b/.my_venv/bin/pip3.7
deleted file mode 100755
index f51e24700..000000000
--- a/.my_venv/bin/pip3.7
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/Users/lizhubo/workspace/alf/.my_venv/bin/python
-# -*- coding: utf-8 -*-
-import re
-import sys
-
-from pip._internal import main
-
-if __name__ == '__main__':
-    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
-    sys.exit(main())
diff --git a/.my_venv/bin/python b/.my_venv/bin/python
deleted file mode 120000
index 3381f8782..000000000
--- a/.my_venv/bin/python
+++ /dev/null
@@ -1 +0,0 @@
-/opt/anaconda3/bin/python
\ No newline at end of file
diff --git a/.my_venv/bin/python3 b/.my_venv/bin/python3
deleted file mode 120000
index d8654aa0e..000000000
--- a/.my_venv/bin/python3
+++ /dev/null
@@ -1 +0,0 @@
-python
\ No newline at end of file
diff --git a/.my_venv/pyvenv.cfg b/.my_venv/pyvenv.cfg
deleted file mode 100644
index caaaab722..000000000
--- a/.my_venv/pyvenv.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-home = /opt/anaconda3/bin
-include-system-site-packages = false
-version = 3.7.6
diff --git a/alf/.vscode/launch.json b/alf/.vscode/launch.json
deleted file mode 100644
index 0024ede62..000000000
--- a/alf/.vscode/launch.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-    // Use IntelliSense to learn about possible attributes.
-    // Hover to view descriptions of existing attributes.
-    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-    "version": "0.2.0",
-    "configurations": [
-        {
-            "name": "train",
-            "type": "python",
-            "request": "launch",
-            "cwd": "/Users/lizhubo/workspace/alf/alf/examples",
-            "program": "/Users/lizhubo/workspace/alf/alf/bin/train.py",
-            "env": {"DISPLAY": ":8",
-                    "CUDA_VISIBLE_DEVICES": "0"},
-            "args": [
-                "--root_dir=/Users/lizhubo/Desktop/Pytorch/Results50",
-                "--gin_file=/Users/lizhubo/workspace/alf/alf/examples/sarsa_sac_pendulum.gin",
-            ],
-           
-        }
-    ]
-}