From edde2da1087bcb2b32a33faa7aca54745f7474ea Mon Sep 17 00:00:00 2001 From: zhuboli <55901904+zhuboli@users.noreply.github.com> Date: Tue, 29 Sep 2020 15:16:14 -0700 Subject: [PATCH 1/9] draft_retrace --- alf/algorithms/td_loss.py | 31 ++++++++++++++++++++++++++++--- alf/utils/value_ops.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py index b72dc592d..6e97a9100 100644 --- a/alf/algorithms/td_loss.py +++ b/alf/algorithms/td_loss.py @@ -70,13 +70,16 @@ def __init__(self, self._lambda = td_lambda self._debug_summaries = debug_summaries - def forward(self, experience, value, target_value): + def forward(self, experience, value, target_value,train_info = None): """Cacluate the loss. The first dimension of all the tensors is time dimension and the second dimesion is the batch dimension. Args: + train_info (sac_info or sarsa_info): in order to calculate the importance ratio + from info.action_distribution. If no input of train info and lambda is not + 0 and 1,it will use multistep method instead of retrace experience (Experience): experience collected from ``unroll()`` or a replay buffer. All tensors are time-major. value (torch.Tensor): the time-major tensor for the value at each time @@ -99,7 +102,7 @@ def forward(self, experience, value, target_value): values=target_value, step_types=experience.step_type, discounts=experience.discount * self._gamma) - else: + elif train_info == None: advantages = value_ops.generalized_advantage_estimation( rewards=experience.reward, values=target_value, @@ -107,7 +110,29 @@ def forward(self, experience, value, target_value): discounts=experience.discount * self._gamma, td_lambda=self._lambda) returns = advantages + target_value[:-1] - + else: + scope = alf.summary.scope(self.__class__.__name__) + importance_ratio,importance_ratio_clipped = value_ops.action_importance_ratio( + action_distribution=train_info.action_distribution, + collect_action_distribution=experience.rollout_info.action_distribution, + action=experience.action, + clipping_mode='capping', + importance_ratio_clipping= 0.0, + log_prob_clipping= 0.0, + scope=scope, + check_numerics=False, + debug_summaries=True) + advantages = value_ops.generalized_advantage_estimation_retrace( + importance_ratio = importance_ratio_clipped, + rewards=experience.reward, + values= value, + target_value = target_value, + step_types=experience.step_type, + discounts=experience.discount * self._gamma, + time_major = True, + td_lambda=self._lambda) + returns = advantages + value[:-1] + returns = returns.detach() value = value[:-1] if self._debug_summaries and alf.summary.should_record_summaries(): diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py index a6bf85a23..c41502a40 100644 --- a/alf/utils/value_ops.py +++ b/alf/utils/value_ops.py @@ -255,3 +255,36 @@ def generalized_advantage_estimation(rewards, advs = advs.transpose(0, 1) return advs.detach() +####### add for the retrace method +def generalized_advantage_estimation_retrace(importance_ratio, discounts, rewards, td_lambda, time_major, values, target_value,step_types): + ############## compare the importance_ratio with 1 + #importance_ratio = torch.min(importance_ratio, torch.tensor(1.)) + ##### why we need this time_major, just sample distuibution? + if not time_major: + discounts = discounts.transpose(0, 1) + rewards = rewards.transpose(0, 1) + values = values.transpose(0, 1) + step_types = step_types.transpose(0, 1) + importance_ratio = importance_ratio.transpose(0,1) + target_value = target_value.transpose(0,1) + + assert values.shape[0] >= 2, ("The sequence length needs to be " + "at least 2. Got {s}".format( + s=values.shape[0])) + + #### calcuate the loss not very clear for this function + advs = torch.zeros_like(values) + is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32) + delta = (rewards[1:] + discounts[1:] * target_value[1:] - values[:-1]) + + + weighted_discounts = discounts[1:] * td_lambda * importance_ratio + with torch.no_grad(): + for t in reversed(range(rewards.shape[0] - 1)): + advs[t] = (1 - is_lasts[t]) * \ + (delta[t] + weighted_discounts[t] * advs[t + 1]) + advs = advs[:-1] + if not time_major: + advs = advs.transpose(0, 1) + + return advs.detach() \ No newline at end of file From 394a39aee6846ecb59c9b0805fe1347a00dfe179 Mon Sep 17 00:00:00 2001 From: zhuboli <55901904+zhuboli@users.noreply.github.com> Date: Thu, 8 Oct 2020 15:31:29 -0700 Subject: [PATCH 2/9] fix retrace --- alf/algorithms/ddpg_algorithm.py | 3 +- alf/algorithms/sac_algorithm.py | 3 +- alf/algorithms/sarsa_algorithm.py | 2 +- alf/algorithms/td_loss.py | 168 ++++++++++++++++++++++++++++++ alf/utils/value_ops.py | 5 +- alf/utils/value_ops_test.py | 25 +++++ 6 files changed, 199 insertions(+), 7 deletions(-) diff --git a/alf/algorithms/ddpg_algorithm.py b/alf/algorithms/ddpg_algorithm.py index 6c5ca7284..98ed6a7cc 100644 --- a/alf/algorithms/ddpg_algorithm.py +++ b/alf/algorithms/ddpg_algorithm.py @@ -326,7 +326,8 @@ def calc_loss(self, experience, train_info: DdpgInfo): critic_losses[i] = self._critic_losses[i]( experience=experience, value=train_info.critic.q_values[:, :, i, ...], - target_value=train_info.critic.target_q_values).loss + target_value=train_info.critic.target_q_values, + train_info = train_info).loss critic_loss = math_ops.add_n(critic_losses) diff --git a/alf/algorithms/sac_algorithm.py b/alf/algorithms/sac_algorithm.py index 65633d297..de44c4cf3 100644 --- a/alf/algorithms/sac_algorithm.py +++ b/alf/algorithms/sac_algorithm.py @@ -757,7 +757,8 @@ def _calc_critic_loss(self, experience, train_info: SacInfo): critic_losses.append( l(experience=experience, value=critic_info.critics[:, :, i, ...], - target_value=critic_info.target_critic).loss) + target_value=critic_info.target_critic, + train_info = train_info).loss) critic_loss = math_ops.add_n(critic_losses) diff --git a/alf/algorithms/sarsa_algorithm.py b/alf/algorithms/sarsa_algorithm.py index 86d07a74f..7c22fcb1b 100644 --- a/alf/algorithms/sarsa_algorithm.py +++ b/alf/algorithms/sarsa_algorithm.py @@ -435,7 +435,7 @@ def calc_loss(self, experience, info: SarsaInfo): target_critic = tensor_utils.tensor_prepend_zero( info.target_critics) loss_info = self._critic_losses[i](shifted_experience, critic, - target_critic) + target_critic,info) critic_losses.append(nest_map(lambda l: l[:-1], loss_info.loss)) critic_loss = math_ops.add_n(critic_losses) diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py index 6e97a9100..3e3289f52 100644 --- a/alf/algorithms/td_loss.py +++ b/alf/algorithms/td_loss.py @@ -8,6 +8,16 @@ # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, # Copyright (c) 2019 Horizon Robotics. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. @@ -23,6 +33,164 @@ from alf.utils.summary_utils import safe_mean_hist_summary +@gin.configurable +class TDLoss(nn.Module): + def __init__(self, + gamma=0.99, + td_error_loss_fn=element_wise_squared_loss, + td_lambda=0.95, + use_retrace=0, + debug_summaries=False, + name="TDLoss"): + r"""Create a TDLoss object. + + Let :math:`G_{t:T}` be the bootstaped return from t to T: + :math:`G_{t:T} = \sum_{i=t+1}^T \gamma^{t-i-1}R_i + \gamma^{T-t} V(s_T)` + If ``td_lambda`` = 1, the target for step t is :math:`G_{t:T}`. + If ``td_lambda`` = 0, the target for step t is :math:`G_{t:t+1}` + If 0 < ``td_lambda`` < 1, the target for step t is the :math:`\lambda`-return: + :math:`G_t^\lambda = (1 - \lambda) \sum_{i=t+1}^{T-1} \lambda^{i-t}G_{t:i} + \lambda^{T-t-1} G_{t:T}` + There is a simple relationship between :math:`\lambda`-return and + the generalized advantage estimation :math:`\hat{A}^{GAE}_t`: + :math:`G_t^\lambda = \hat{A}^{GAE}_t + V(s_t)` + where the generalized advantage estimation is defined as: + :math:`\hat{A}^{GAE}_t = \sum_{i=t}^{T-1}(\gamma\lambda)^{i-t}(R_{i+1} + \gamma V(s_{i+1}) - V(s_i))` + use_retrace = 0 means one step or multi_step loss, use_retrace = 1 means retrace loss + References: + + Schulman et al. `High-Dimensional Continuous Control Using Generalized Advantage Estimation + `_ + + Sutton et al. `Reinforcement Learning: An Introduction + `_, Chapter 12, 2018 + + Args: + gamma (float): A discount factor for future rewards. + td_errors_loss_fn (Callable): A function for computing the TD errors + loss. This function takes as input the target and the estimated + Q values and returns the loss for each element of the batch. + td_lambda (float): Lambda parameter for TD-lambda computation. + debug_summaries (bool): True if debug summaries should be created. + name (str): The name of this loss. + """ + super().__init__() + + self._name = name + self._gamma = gamma + self._td_error_loss_fn = td_error_loss_fn + self._lambda = td_lambda + self._debug_summaries = debug_summaries + self._use_retrace = use_retrace + def forward(self, experience, value, target_value, train_info): + """Cacluate the loss. + + The first dimension of all the tensors is time dimension and the second + dimesion is the batch dimension. + + Args: + experience (Experience): experience collected from ``unroll()`` or + a replay buffer. All tensors are time-major. + value (torch.Tensor): the time-major tensor for the value at each time + step. The loss is between this and the calculated return. + target_value (torch.Tensor): the time-major tensor for the value at + each time step. This is used to calculate return. ``target_value`` + can be same as ``value``. + train_info (sarsa info, sac info): information used to calcuate importance_ratio + or importance_ratio_clipped + Returns: + LossInfo: with the ``extra`` field same as ``loss``. + """ + if self._lambda == 1.0: + returns = value_ops.discounted_return( + rewards=experience.reward, + values=target_value, + step_types=experience.step_type, + discounts=experience.discount * self._gamma) + elif self._lambda == 0.0: + returns = value_ops.one_step_discounted_return( + rewards=experience.reward, + values=target_value, + step_types=experience.step_type, + discounts=experience.discount * self._gamma) + elif self._use_retrace == 0: + advantages = value_ops.generalized_advantage_estimation( + rewards=experience.reward, + values=target_value, + step_types=experience.step_type, + discounts=experience.discount * self._gamma, + td_lambda=self._lambda) + returns = advantages + target_value[:-1] + else: + scope = alf.summary.scope(self.__class__.__name__) + importance_ratio,importance_ratio_clipped = value_ops.action_importance_ratio( + action_distribution=train_info.action_distribution, + collect_action_distribution=experience.rollout_info.action_distribution, + action=experience.action, + clipping_mode='capping', + importance_ratio_clipping= 0.0, + log_prob_clipping= 0.0, + scope=scope, + check_numerics=False, + debug_summaries=True) + advantages = value_ops.generalized_advantage_estimation_retrace( + importance_ratio = importance_ratio_clipped, + rewards=experience.reward, + values= value, + target_value = target_value, + step_types=experience.step_type, + discounts=experience.discount * self._gamma, + time_major = True, + td_lambda=self._lambda) + returns = advantages + value[:-1] + returns = returns.detach() + value = value[:-1] + + if self._debug_summaries and alf.summary.should_record_summaries(): + mask = experience.step_type[:-1] != StepType.LAST + with alf.summary.scope(self._name): + + def _summarize(v, r, td, suffix): + alf.summary.scalar( + "explained_variance_of_return_by_value" + suffix, + tensor_utils.explained_variance(v, r, mask)) + safe_mean_hist_summary('values' + suffix, v, mask) + safe_mean_hist_summary('returns' + suffix, r, mask) + safe_mean_hist_summary("td_error" + suffix, td, mask) + + if value.ndim == 2: + _summarize(value, returns, returns - value, '') + else: + td = returns - value + for i in range(value.shape[2]): + suffix = '/' + str(i) + _summarize(value[..., i], returns[..., i], td[..., i], + suffix) + + loss = self._td_error_loss_fn(returns.detach(), value) + + if loss.ndim == 3: + # Multidimensional reward. Average over the critic loss for all dimensions + loss = loss.mean(dim=2) + + # The shape of the loss expected by Algorith.update_with_gradient is + # [T, B], so we need to augment it with additional zeros. + loss = tensor_utils.tensor_extend_zero(loss) + return LossInfo(loss=loss, extra=loss) +either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gin +import torch +import torch.nn as nn + +import alf +from alf.data_structures import LossInfo, StepType +from alf.utils.losses import element_wise_squared_loss +from alf.utils import tensor_utils, value_ops +from alf.utils.summary_utils import safe_mean_hist_summary + + @gin.configurable class TDLoss(nn.Module): def __init__(self, diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py index c41502a40..a93d2cb4d 100644 --- a/alf/utils/value_ops.py +++ b/alf/utils/value_ops.py @@ -255,11 +255,10 @@ def generalized_advantage_estimation(rewards, advs = advs.transpose(0, 1) return advs.detach() + ####### add for the retrace method def generalized_advantage_estimation_retrace(importance_ratio, discounts, rewards, td_lambda, time_major, values, target_value,step_types): - ############## compare the importance_ratio with 1 #importance_ratio = torch.min(importance_ratio, torch.tensor(1.)) - ##### why we need this time_major, just sample distuibution? if not time_major: discounts = discounts.transpose(0, 1) rewards = rewards.transpose(0, 1) @@ -271,8 +270,6 @@ def generalized_advantage_estimation_retrace(importance_ratio, discounts, reward assert values.shape[0] >= 2, ("The sequence length needs to be " "at least 2. Got {s}".format( s=values.shape[0])) - - #### calcuate the loss not very clear for this function advs = torch.zeros_like(values) is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32) delta = (rewards[1:] + discounts[1:] * target_value[1:] - values[:-1]) diff --git a/alf/utils/value_ops_test.py b/alf/utils/value_ops_test.py index ebd526127..024c12bca 100644 --- a/alf/utils/value_ops_test.py +++ b/alf/utils/value_ops_test.py @@ -170,7 +170,32 @@ def test_generalized_advantage_estimation(self): discounts=discounts, td_lambda=td_lambda, expected=expected) + +class GeneralizedAdvantage_retrace_Test(unittest.TestCase): + """Tests for alf.utils.value_ops + """ + def test_generalized_advantage_estimation_retrace(self): + values = torch.tensor([[2.] * 4], dtype=torch.float32) + step_types = torch.tensor([[StepType.MID] * 4], dtype=torch.int64) + rewards = torch.tensor([[3.] * 4], dtype=torch.float32) + discounts = torch.tensor([[0.9] * 4], dtype=torch.float32) + td_lambda = 0.6/0.9 + target_value = torch.tensor([[3.] * 4], dtype=torch.float32) + importance_ratio = torch.tensor([[0.8] * 3], dtype=torch.float32) + d = 3 * 0.9+ 3 - 2 + expected = torch.tensor([[ (d * 0.6 * 0.8 ) *0.6 * 0.8+ 0.6 * 0.8 * d + d, d * 0.6 * 0.8 + d, d]], + dtype=torch.float32) + np.testing.assert_array_almost_equal( + value_ops.generalized_advantage_estimation_retrace( + rewards=rewards, + values=values, + target_value = target_value, + step_types=step_types, + discounts=discounts, + td_lambda=td_lambda, + importance_ratio = importance_ratio, + time_major=False), expected) if __name__ == '__main__': unittest.main() From 074b5daaac45887bab62c92a42366dde628a89dd Mon Sep 17 00:00:00 2001 From: zhuboli <55901904+zhuboli@users.noreply.github.com> Date: Thu, 8 Oct 2020 15:33:45 -0700 Subject: [PATCH 3/9] fix retrace --- alf/algorithms/td_loss.py | 168 -------------------------------------- 1 file changed, 168 deletions(-) diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py index 3e3289f52..281a7ff1c 100644 --- a/alf/algorithms/td_loss.py +++ b/alf/algorithms/td_loss.py @@ -8,16 +8,6 @@ # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, # Copyright (c) 2019 Horizon Robotics. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. @@ -168,164 +158,6 @@ def _summarize(v, r, td, suffix): loss = self._td_error_loss_fn(returns.detach(), value) - if loss.ndim == 3: - # Multidimensional reward. Average over the critic loss for all dimensions - loss = loss.mean(dim=2) - - # The shape of the loss expected by Algorith.update_with_gradient is - # [T, B], so we need to augment it with additional zeros. - loss = tensor_utils.tensor_extend_zero(loss) - return LossInfo(loss=loss, extra=loss) -either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gin -import torch -import torch.nn as nn - -import alf -from alf.data_structures import LossInfo, StepType -from alf.utils.losses import element_wise_squared_loss -from alf.utils import tensor_utils, value_ops -from alf.utils.summary_utils import safe_mean_hist_summary - - -@gin.configurable -class TDLoss(nn.Module): - def __init__(self, - gamma=0.99, - td_error_loss_fn=element_wise_squared_loss, - td_lambda=0.95, - debug_summaries=False, - name="TDLoss"): - r"""Create a TDLoss object. - - Let :math:`G_{t:T}` be the bootstaped return from t to T: - :math:`G_{t:T} = \sum_{i=t+1}^T \gamma^{t-i-1}R_i + \gamma^{T-t} V(s_T)` - If ``td_lambda`` = 1, the target for step t is :math:`G_{t:T}`. - If ``td_lambda`` = 0, the target for step t is :math:`G_{t:t+1}` - If 0 < ``td_lambda`` < 1, the target for step t is the :math:`\lambda`-return: - :math:`G_t^\lambda = (1 - \lambda) \sum_{i=t+1}^{T-1} \lambda^{i-t}G_{t:i} + \lambda^{T-t-1} G_{t:T}` - There is a simple relationship between :math:`\lambda`-return and - the generalized advantage estimation :math:`\hat{A}^{GAE}_t`: - :math:`G_t^\lambda = \hat{A}^{GAE}_t + V(s_t)` - where the generalized advantage estimation is defined as: - :math:`\hat{A}^{GAE}_t = \sum_{i=t}^{T-1}(\gamma\lambda)^{i-t}(R_{i+1} + \gamma V(s_{i+1}) - V(s_i))` - - References: - - Schulman et al. `High-Dimensional Continuous Control Using Generalized Advantage Estimation - `_ - - Sutton et al. `Reinforcement Learning: An Introduction - `_, Chapter 12, 2018 - - Args: - gamma (float): A discount factor for future rewards. - td_errors_loss_fn (Callable): A function for computing the TD errors - loss. This function takes as input the target and the estimated - Q values and returns the loss for each element of the batch. - td_lambda (float): Lambda parameter for TD-lambda computation. - debug_summaries (bool): True if debug summaries should be created. - name (str): The name of this loss. - """ - super().__init__() - - self._name = name - self._gamma = gamma - self._td_error_loss_fn = td_error_loss_fn - self._lambda = td_lambda - self._debug_summaries = debug_summaries - - def forward(self, experience, value, target_value,train_info = None): - """Cacluate the loss. - - The first dimension of all the tensors is time dimension and the second - dimesion is the batch dimension. - - Args: - train_info (sac_info or sarsa_info): in order to calculate the importance ratio - from info.action_distribution. If no input of train info and lambda is not - 0 and 1,it will use multistep method instead of retrace - experience (Experience): experience collected from ``unroll()`` or - a replay buffer. All tensors are time-major. - value (torch.Tensor): the time-major tensor for the value at each time - step. The loss is between this and the calculated return. - target_value (torch.Tensor): the time-major tensor for the value at - each time step. This is used to calculate return. ``target_value`` - can be same as ``value``. - Returns: - LossInfo: with the ``extra`` field same as ``loss``. - """ - if self._lambda == 1.0: - returns = value_ops.discounted_return( - rewards=experience.reward, - values=target_value, - step_types=experience.step_type, - discounts=experience.discount * self._gamma) - elif self._lambda == 0.0: - returns = value_ops.one_step_discounted_return( - rewards=experience.reward, - values=target_value, - step_types=experience.step_type, - discounts=experience.discount * self._gamma) - elif train_info == None: - advantages = value_ops.generalized_advantage_estimation( - rewards=experience.reward, - values=target_value, - step_types=experience.step_type, - discounts=experience.discount * self._gamma, - td_lambda=self._lambda) - returns = advantages + target_value[:-1] - else: - scope = alf.summary.scope(self.__class__.__name__) - importance_ratio,importance_ratio_clipped = value_ops.action_importance_ratio( - action_distribution=train_info.action_distribution, - collect_action_distribution=experience.rollout_info.action_distribution, - action=experience.action, - clipping_mode='capping', - importance_ratio_clipping= 0.0, - log_prob_clipping= 0.0, - scope=scope, - check_numerics=False, - debug_summaries=True) - advantages = value_ops.generalized_advantage_estimation_retrace( - importance_ratio = importance_ratio_clipped, - rewards=experience.reward, - values= value, - target_value = target_value, - step_types=experience.step_type, - discounts=experience.discount * self._gamma, - time_major = True, - td_lambda=self._lambda) - returns = advantages + value[:-1] - returns = returns.detach() - value = value[:-1] - - if self._debug_summaries and alf.summary.should_record_summaries(): - mask = experience.step_type[:-1] != StepType.LAST - with alf.summary.scope(self._name): - - def _summarize(v, r, td, suffix): - alf.summary.scalar( - "explained_variance_of_return_by_value" + suffix, - tensor_utils.explained_variance(v, r, mask)) - safe_mean_hist_summary('values' + suffix, v, mask) - safe_mean_hist_summary('returns' + suffix, r, mask) - safe_mean_hist_summary("td_error" + suffix, td, mask) - - if value.ndim == 2: - _summarize(value, returns, returns - value, '') - else: - td = returns - value - for i in range(value.shape[2]): - suffix = '/' + str(i) - _summarize(value[..., i], returns[..., i], td[..., i], - suffix) - - loss = self._td_error_loss_fn(returns.detach(), value) - if loss.ndim == 3: # Multidimensional reward. Average over the critic loss for all dimensions loss = loss.mean(dim=2) From 07b5929809bf364c26f8f52a60c4da1796178446 Mon Sep 17 00:00:00 2001 From: zhuboli <55901904+zhuboli@users.noreply.github.com> Date: Thu, 8 Oct 2020 15:34:55 -0700 Subject: [PATCH 4/9] fix retrace --- alf/utils/value_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py index a93d2cb4d..2bf336ed2 100644 --- a/alf/utils/value_ops.py +++ b/alf/utils/value_ops.py @@ -258,7 +258,7 @@ def generalized_advantage_estimation(rewards, ####### add for the retrace method def generalized_advantage_estimation_retrace(importance_ratio, discounts, rewards, td_lambda, time_major, values, target_value,step_types): - #importance_ratio = torch.min(importance_ratio, torch.tensor(1.)) + if not time_major: discounts = discounts.transpose(0, 1) rewards = rewards.transpose(0, 1) From 1219ef1325ea66630d33e8670c7ceee4773c9c30 Mon Sep 17 00:00:00 2001 From: zhuboli <55901904+zhuboli@users.noreply.github.com> Date: Mon, 19 Oct 2020 14:58:04 -0700 Subject: [PATCH 5/9] fix conflicts --- alf/algorithms/td_loss.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py index 281a7ff1c..6c1827719 100644 --- a/alf/algorithms/td_loss.py +++ b/alf/algorithms/td_loss.py @@ -29,6 +29,7 @@ def __init__(self, gamma=0.99, td_error_loss_fn=element_wise_squared_loss, td_lambda=0.95, + normalize_target=False, use_retrace=0, debug_summaries=False, name="TDLoss"): @@ -70,6 +71,8 @@ def __init__(self, self._td_error_loss_fn = td_error_loss_fn self._lambda = td_lambda self._debug_summaries = debug_summaries + self._normalize_target = normalize_target + self._target_normalizer = None self._use_retrace = use_retrace def forward(self, experience, value, target_value, train_info): """Cacluate the loss. From 027c817f9a1dbba9a4366edf47fe3452328699f7 Mon Sep 17 00:00:00 2001 From: zhuboli <55901904+zhuboli@users.noreply.github.com> Date: Fri, 23 Oct 2020 17:25:44 -0700 Subject: [PATCH 6/9] fix retrace --- alf/algorithms/sarsa_algorithm.py | 2 +- alf/algorithms/td_loss.py | 36 +++++++++++++++---------------- alf/utils/value_ops.py | 31 ++++++++++++++++++++------ alf/utils/value_ops_test.py | 18 ++++++++++------ 4 files changed, 54 insertions(+), 33 deletions(-) diff --git a/alf/algorithms/sarsa_algorithm.py b/alf/algorithms/sarsa_algorithm.py index 7c22fcb1b..c451b52df 100644 --- a/alf/algorithms/sarsa_algorithm.py +++ b/alf/algorithms/sarsa_algorithm.py @@ -435,7 +435,7 @@ def calc_loss(self, experience, info: SarsaInfo): target_critic = tensor_utils.tensor_prepend_zero( info.target_critics) loss_info = self._critic_losses[i](shifted_experience, critic, - target_critic,info) + target_critic, info) critic_losses.append(nest_map(lambda l: l[:-1], loss_info.loss)) critic_loss = math_ops.add_n(critic_losses) diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py index dc2588ad2..06354f75a 100644 --- a/alf/algorithms/td_loss.py +++ b/alf/algorithms/td_loss.py @@ -31,10 +31,7 @@ def __init__(self, td_error_loss_fn=element_wise_squared_loss, td_lambda=0.95, normalize_target=False, - some-feature-retrace - use_retrace=0, - - pytorch + use_retrace=False, debug_summaries=False, name="TDLoss"): r"""Create a TDLoss object. @@ -51,6 +48,7 @@ def __init__(self, where the generalized advantage estimation is defined as: :math:`\hat{A}^{GAE}_t = \sum_{i=t}^{T-1}(\gamma\lambda)^{i-t}(R_{i+1} + \gamma V(s_{i+1}) - V(s_i))` use_retrace = 0 means one step or multi_step loss, use_retrace = 1 means retrace loss + :math:`\mathcal{R} Q(x, a):=Q(x, a)+\mathbb{E}_{\mu}\left[\sum_{t \geq 0} \gamma^{t}\left(\prod_{s=1}^{t} c_{s}\right)\left(r_{t}+\gamma \mathbb{E}_{\pi} Q\left(x_{t+1}, \cdot\right)-Q\left(x_{t}, a_{t}\right)\right)\right]` References: Schulman et al. `High-Dimensional Continuous Control Using Generalized Advantage Estimation @@ -59,6 +57,9 @@ def __init__(self, Sutton et al. `Reinforcement Learning: An Introduction `_, Chapter 12, 2018 + Remi Munos et al. `Safe and efficient off-policy reinforcement learning + `_ + Args: gamma (float): A discount factor for future rewards. td_errors_loss_fn (Callable): A function for computing the TD errors @@ -80,13 +81,9 @@ def __init__(self, self._debug_summaries = debug_summaries self._normalize_target = normalize_target self._target_normalizer = None - some-feature-retrace self._use_retrace = use_retrace - def forward(self, experience, value, target_value, train_info): - - def forward(self, experience, value, target_value): - pytorch + def forward(self, experience, value, target_value, train_info): """Cacluate the loss. The first dimension of all the tensors is time dimension and the second @@ -117,7 +114,7 @@ def forward(self, experience, value, target_value): values=target_value, step_types=experience.step_type, discounts=experience.discount * self._gamma) - elif self._use_retrace == 0: + elif self._use_retrace == False: advantages = value_ops.generalized_advantage_estimation( rewards=experience.reward, values=target_value, @@ -126,25 +123,26 @@ def forward(self, experience, value, target_value): td_lambda=self._lambda) returns = advantages + target_value[:-1] else: - scope = alf.summary.scope(self.__class__.__name__) - importance_ratio,importance_ratio_clipped = value_ops.action_importance_ratio( + scope = alf.summary.scope(self.__class__.__name__) + importance_ratio, importance_ratio_clipped = value_ops.action_importance_ratio( action_distribution=train_info.action_distribution, - collect_action_distribution=experience.rollout_info.action_distribution, + collect_action_distribution=experience.rollout_info. + action_distribution, action=experience.action, clipping_mode='capping', - importance_ratio_clipping= 0.0, - log_prob_clipping= 0.0, + importance_ratio_clipping=0.0, + log_prob_clipping=0.0, scope=scope, check_numerics=False, debug_summaries=True) advantages = value_ops.generalized_advantage_estimation_retrace( - importance_ratio = importance_ratio_clipped, + importance_ratio=importance_ratio_clipped, rewards=experience.reward, - values= value, - target_value = target_value, + values=value, + target_value=target_value, step_types=experience.step_type, discounts=experience.discount * self._gamma, - time_major = True, + time_major=True, td_lambda=self._lambda) returns = advantages + value[:-1] returns = returns.detach() diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py index 2bf336ed2..75f379d1f 100644 --- a/alf/utils/value_ops.py +++ b/alf/utils/value_ops.py @@ -256,16 +256,36 @@ def generalized_advantage_estimation(rewards, return advs.detach() + ####### add for the retrace method -def generalized_advantage_estimation_retrace(importance_ratio, discounts, rewards, td_lambda, time_major, values, target_value,step_types): - +def generalized_advantage_estimation_retrace(importance_ratio, discounts, + rewards, td_lambda, time_major, + values, target_value, step_types): + """ + compute the generalized advantage estimation for retrace method. Main change is adding + importance ratio + + Args: + importance_ratio: shape is [T], scalar between [0,1]. representing importance ratio + rewards (Tensor): shape is [T, B] (or [T]) representing rewards. + values (Tensor): shape is [T,B] (or [T]) representing values. + step_types (Tensor): shape is [T,B] (or [T]) representing step types. + discounts (Tensor): shape is [T, B] (or [T]) representing discounts. + td_lambda (float): A scalar between [0, 1]. It's used for variance + reduction in temporal difference. + time_major (bool): Whether input tensors are time major. + False means input tensors have shape [B, T]. + Returns: + A tensor with shape [T-1, B] representing advantages. Shape is [B, T-1] + when time_major is false. + """ if not time_major: discounts = discounts.transpose(0, 1) rewards = rewards.transpose(0, 1) values = values.transpose(0, 1) step_types = step_types.transpose(0, 1) - importance_ratio = importance_ratio.transpose(0,1) - target_value = target_value.transpose(0,1) + importance_ratio = importance_ratio.transpose(0, 1) + target_value = target_value.transpose(0, 1) assert values.shape[0] >= 2, ("The sequence length needs to be " "at least 2. Got {s}".format( @@ -274,7 +294,6 @@ def generalized_advantage_estimation_retrace(importance_ratio, discounts, reward is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32) delta = (rewards[1:] + discounts[1:] * target_value[1:] - values[:-1]) - weighted_discounts = discounts[1:] * td_lambda * importance_ratio with torch.no_grad(): for t in reversed(range(rewards.shape[0] - 1)): @@ -284,4 +303,4 @@ def generalized_advantage_estimation_retrace(importance_ratio, discounts, reward if not time_major: advs = advs.transpose(0, 1) - return advs.detach() \ No newline at end of file + return advs.detach() diff --git a/alf/utils/value_ops_test.py b/alf/utils/value_ops_test.py index 024c12bca..106f0d58e 100644 --- a/alf/utils/value_ops_test.py +++ b/alf/utils/value_ops_test.py @@ -170,7 +170,8 @@ def test_generalized_advantage_estimation(self): discounts=discounts, td_lambda=td_lambda, expected=expected) - + + class GeneralizedAdvantage_retrace_Test(unittest.TestCase): """Tests for alf.utils.value_ops """ @@ -180,22 +181,25 @@ def test_generalized_advantage_estimation_retrace(self): step_types = torch.tensor([[StepType.MID] * 4], dtype=torch.int64) rewards = torch.tensor([[3.] * 4], dtype=torch.float32) discounts = torch.tensor([[0.9] * 4], dtype=torch.float32) - td_lambda = 0.6/0.9 + td_lambda = 0.6 / 0.9 target_value = torch.tensor([[3.] * 4], dtype=torch.float32) importance_ratio = torch.tensor([[0.8] * 3], dtype=torch.float32) - d = 3 * 0.9+ 3 - 2 - expected = torch.tensor([[ (d * 0.6 * 0.8 ) *0.6 * 0.8+ 0.6 * 0.8 * d + d, d * 0.6 * 0.8 + d, d]], - dtype=torch.float32) + d = 3 * 0.9 + 3 - 2 + expected = torch.tensor( + [[(d * 0.6 * 0.8) * 0.6 * 0.8 + 0.6 * 0.8 * d + d, + d * 0.6 * 0.8 + d, d]], + dtype=torch.float32) np.testing.assert_array_almost_equal( value_ops.generalized_advantage_estimation_retrace( rewards=rewards, values=values, - target_value = target_value, + target_value=target_value, step_types=step_types, discounts=discounts, td_lambda=td_lambda, - importance_ratio = importance_ratio, + importance_ratio=importance_ratio, time_major=False), expected) + if __name__ == '__main__': unittest.main() From 23d59456ce365a6c8ff7fd3f45be29763ca712ab Mon Sep 17 00:00:00 2001 From: zhuboli <55901904+zhuboli@users.noreply.github.com> Date: Tue, 27 Oct 2020 18:41:28 -0700 Subject: [PATCH 7/9] still need merge advantage function --- alf/algorithms/td_loss.py | 14 +++++++++----- alf/utils/value_ops.py | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py index 06354f75a..44b6379ef 100644 --- a/alf/algorithms/td_loss.py +++ b/alf/algorithms/td_loss.py @@ -47,7 +47,7 @@ def __init__(self, :math:`G_t^\lambda = \hat{A}^{GAE}_t + V(s_t)` where the generalized advantage estimation is defined as: :math:`\hat{A}^{GAE}_t = \sum_{i=t}^{T-1}(\gamma\lambda)^{i-t}(R_{i+1} + \gamma V(s_{i+1}) - V(s_i))` - use_retrace = 0 means one step or multi_step loss, use_retrace = 1 means retrace loss + use_retrace = False means one step or multi_step loss, use_retrace = True means retrace loss :math:`\mathcal{R} Q(x, a):=Q(x, a)+\mathbb{E}_{\mu}\left[\sum_{t \geq 0} \gamma^{t}\left(\prod_{s=1}^{t} c_{s}\right)\left(r_{t}+\gamma \mathbb{E}_{\pi} Q\left(x_{t+1}, \cdot\right)-Q\left(x_{t}, a_{t}\right)\right)\right]` References: @@ -97,8 +97,11 @@ def forward(self, experience, value, target_value, train_info): target_value (torch.Tensor): the time-major tensor for the value at each time step. This is used to calculate return. ``target_value`` can be same as ``value``. - train_info (sarsa info, sac info): information used to calcuate importance_ratio - or importance_ratio_clipped + train_info : train_info includes action distrbution, actor, critic and + other information. Different algorithm may have different info inside. + For the retrace method, we can use SarsaInfo, SacInfo or DdpgInfo as train_info + for Sac, Sarsa or Ddpg algorithm. Adding train_info to calculate importance_ratio + and importance_ratio_clipped. Returns: LossInfo: with the ``extra`` field same as ``loss``. """ @@ -124,7 +127,8 @@ def forward(self, experience, value, target_value, train_info): returns = advantages + target_value[:-1] else: scope = alf.summary.scope(self.__class__.__name__) - importance_ratio, importance_ratio_clipped = value_ops.action_importance_ratio( + importance_ratio, importance_ratio_clipped = value_ops. \ + action_importance_ratio( action_distribution=train_info.action_distribution, collect_action_distribution=experience.rollout_info. action_distribution, @@ -134,7 +138,7 @@ def forward(self, experience, value, target_value, train_info): log_prob_clipping=0.0, scope=scope, check_numerics=False, - debug_summaries=True) + debug_summaries=self._debug_summaries) advantages = value_ops.generalized_advantage_estimation_retrace( importance_ratio=importance_ratio_clipped, rewards=experience.reward, diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py index 75f379d1f..66e685027 100644 --- a/alf/utils/value_ops.py +++ b/alf/utils/value_ops.py @@ -257,7 +257,7 @@ def generalized_advantage_estimation(rewards, return advs.detach() -####### add for the retrace method +# add for the retrace method def generalized_advantage_estimation_retrace(importance_ratio, discounts, rewards, td_lambda, time_major, values, target_value, step_types): @@ -266,7 +266,7 @@ def generalized_advantage_estimation_retrace(importance_ratio, discounts, importance ratio Args: - importance_ratio: shape is [T], scalar between [0,1]. representing importance ratio + importance_ratio: shape is [T], scalar between [0,1]. Representing importance ratio rewards (Tensor): shape is [T, B] (or [T]) representing rewards. values (Tensor): shape is [T,B] (or [T]) representing values. step_types (Tensor): shape is [T,B] (or [T]) representing step types. From 5cafd48932f690bf16b1f9c17d55b33eb788b39f Mon Sep 17 00:00:00 2001 From: zhuboli <55901904+zhuboli@users.noreply.github.com> Date: Thu, 26 Nov 2020 07:51:34 +0800 Subject: [PATCH 8/9] merge function and fix bug --- .my_venv/bin/activate | 76 +++++++++++++++++++++++++++++ .my_venv/bin/activate.csh | 37 ++++++++++++++ .my_venv/bin/activate.fish | 75 ++++++++++++++++++++++++++++ .my_venv/bin/easy_install | 10 ++++ .my_venv/bin/easy_install-3.7 | 10 ++++ .my_venv/bin/pip | 10 ++++ .my_venv/bin/pip3 | 10 ++++ .my_venv/bin/pip3.7 | 10 ++++ .my_venv/bin/python | 1 + .my_venv/bin/python3 | 1 + .my_venv/pyvenv.cfg | 3 ++ alf/.vscode/launch.json | 22 +++++++++ alf/algorithms/actor_critic_loss.py | 5 +- alf/algorithms/ppo_algorithm.py | 3 ++ alf/algorithms/td_loss.py | 22 ++++++++- alf/examples/carla.gin | 4 +- alf/utils/value_ops.py | 32 ++++++++---- alf/utils/value_ops_test.py | 28 +++++++++-- setup.py | 2 +- 19 files changed, 341 insertions(+), 20 deletions(-) create mode 100644 .my_venv/bin/activate create mode 100644 .my_venv/bin/activate.csh create mode 100644 .my_venv/bin/activate.fish create mode 100755 .my_venv/bin/easy_install create mode 100755 .my_venv/bin/easy_install-3.7 create mode 100755 .my_venv/bin/pip create mode 100755 .my_venv/bin/pip3 create mode 100755 .my_venv/bin/pip3.7 create mode 120000 .my_venv/bin/python create mode 120000 .my_venv/bin/python3 create mode 100644 .my_venv/pyvenv.cfg create mode 100644 alf/.vscode/launch.json diff --git a/.my_venv/bin/activate b/.my_venv/bin/activate new file mode 100644 index 000000000..1db2cc311 --- /dev/null +++ b/.my_venv/bin/activate @@ -0,0 +1,76 @@ +# This file must be used with "source bin/activate" *from bash* +# you cannot run it directly + +deactivate () { + # reset old environment variables + if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then + PATH="${_OLD_VIRTUAL_PATH:-}" + export PATH + unset _OLD_VIRTUAL_PATH + fi + if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then + PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}" + export PYTHONHOME + unset _OLD_VIRTUAL_PYTHONHOME + fi + + # This should detect bash and zsh, which have a hash command that must + # be called to get it to forget past commands. Without forgetting + # past commands the $PATH changes we made may not be respected + if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then + hash -r + fi + + if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then + PS1="${_OLD_VIRTUAL_PS1:-}" + export PS1 + unset _OLD_VIRTUAL_PS1 + fi + + unset VIRTUAL_ENV + if [ ! "${1:-}" = "nondestructive" ] ; then + # Self destruct! + unset -f deactivate + fi +} + +# unset irrelevant variables +deactivate nondestructive + +VIRTUAL_ENV="/Users/lizhubo/workspace/alf/.my_venv" +export VIRTUAL_ENV + +_OLD_VIRTUAL_PATH="$PATH" +PATH="$VIRTUAL_ENV/bin:$PATH" +export PATH + +# unset PYTHONHOME if set +# this will fail if PYTHONHOME is set to the empty string (which is bad anyway) +# could use `if (set -u; : $PYTHONHOME) ;` in bash +if [ -n "${PYTHONHOME:-}" ] ; then + _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}" + unset PYTHONHOME +fi + +if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then + _OLD_VIRTUAL_PS1="${PS1:-}" + if [ "x(.my_venv) " != x ] ; then + PS1="(.my_venv) ${PS1:-}" + else + if [ "`basename \"$VIRTUAL_ENV\"`" = "__" ] ; then + # special case for Aspen magic directories + # see http://www.zetadev.com/software/aspen/ + PS1="[`basename \`dirname \"$VIRTUAL_ENV\"\``] $PS1" + else + PS1="(`basename \"$VIRTUAL_ENV\"`)$PS1" + fi + fi + export PS1 +fi + +# This should detect bash and zsh, which have a hash command that must +# be called to get it to forget past commands. Without forgetting +# past commands the $PATH changes we made may not be respected +if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then + hash -r +fi diff --git a/.my_venv/bin/activate.csh b/.my_venv/bin/activate.csh new file mode 100644 index 000000000..ec285bc16 --- /dev/null +++ b/.my_venv/bin/activate.csh @@ -0,0 +1,37 @@ +# This file must be used with "source bin/activate.csh" *from csh*. +# You cannot run it directly. +# Created by Davide Di Blasi . +# Ported to Python 3.3 venv by Andrew Svetlov + +alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; test "\!:*" != "nondestructive" && unalias deactivate' + +# Unset irrelevant variables. +deactivate nondestructive + +setenv VIRTUAL_ENV "/Users/lizhubo/workspace/alf/.my_venv" + +set _OLD_VIRTUAL_PATH="$PATH" +setenv PATH "$VIRTUAL_ENV/bin:$PATH" + + +set _OLD_VIRTUAL_PROMPT="$prompt" + +if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then + if (".my_venv" != "") then + set env_name = ".my_venv" + else + if (`basename "VIRTUAL_ENV"` == "__") then + # special case for Aspen magic directories + # see http://www.zetadev.com/software/aspen/ + set env_name = `basename \`dirname "$VIRTUAL_ENV"\`` + else + set env_name = `basename "$VIRTUAL_ENV"` + endif + endif + set prompt = "[$env_name] $prompt" + unset env_name +endif + +alias pydoc python -m pydoc + +rehash diff --git a/.my_venv/bin/activate.fish b/.my_venv/bin/activate.fish new file mode 100644 index 000000000..54a7ffcfb --- /dev/null +++ b/.my_venv/bin/activate.fish @@ -0,0 +1,75 @@ +# This file must be used with ". bin/activate.fish" *from fish* (http://fishshell.org) +# you cannot run it directly + +function deactivate -d "Exit virtualenv and return to normal shell environment" + # reset old environment variables + if test -n "$_OLD_VIRTUAL_PATH" + set -gx PATH $_OLD_VIRTUAL_PATH + set -e _OLD_VIRTUAL_PATH + end + if test -n "$_OLD_VIRTUAL_PYTHONHOME" + set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME + set -e _OLD_VIRTUAL_PYTHONHOME + end + + if test -n "$_OLD_FISH_PROMPT_OVERRIDE" + functions -e fish_prompt + set -e _OLD_FISH_PROMPT_OVERRIDE + functions -c _old_fish_prompt fish_prompt + functions -e _old_fish_prompt + end + + set -e VIRTUAL_ENV + if test "$argv[1]" != "nondestructive" + # Self destruct! + functions -e deactivate + end +end + +# unset irrelevant variables +deactivate nondestructive + +set -gx VIRTUAL_ENV "/Users/lizhubo/workspace/alf/.my_venv" + +set -gx _OLD_VIRTUAL_PATH $PATH +set -gx PATH "$VIRTUAL_ENV/bin" $PATH + +# unset PYTHONHOME if set +if set -q PYTHONHOME + set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME + set -e PYTHONHOME +end + +if test -z "$VIRTUAL_ENV_DISABLE_PROMPT" + # fish uses a function instead of an env var to generate the prompt. + + # save the current fish_prompt function as the function _old_fish_prompt + functions -c fish_prompt _old_fish_prompt + + # with the original prompt function renamed, we can override with our own. + function fish_prompt + # Save the return status of the last command + set -l old_status $status + + # Prompt override? + if test -n "(.my_venv) " + printf "%s%s" "(.my_venv) " (set_color normal) + else + # ...Otherwise, prepend env + set -l _checkbase (basename "$VIRTUAL_ENV") + if test $_checkbase = "__" + # special case for Aspen magic directories + # see http://www.zetadev.com/software/aspen/ + printf "%s[%s]%s " (set_color -b blue white) (basename (dirname "$VIRTUAL_ENV")) (set_color normal) + else + printf "%s(%s)%s" (set_color -b blue white) (basename "$VIRTUAL_ENV") (set_color normal) + end + end + + # Restore the return status of the previous command. + echo "exit $old_status" | . + _old_fish_prompt + end + + set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV" +end diff --git a/.my_venv/bin/easy_install b/.my_venv/bin/easy_install new file mode 100755 index 000000000..f630a2891 --- /dev/null +++ b/.my_venv/bin/easy_install @@ -0,0 +1,10 @@ +#!/Users/lizhubo/workspace/alf/.my_venv/bin/python +# -*- coding: utf-8 -*- +import re +import sys + +from setuptools.command.easy_install import main + +if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) + sys.exit(main()) diff --git a/.my_venv/bin/easy_install-3.7 b/.my_venv/bin/easy_install-3.7 new file mode 100755 index 000000000..f630a2891 --- /dev/null +++ b/.my_venv/bin/easy_install-3.7 @@ -0,0 +1,10 @@ +#!/Users/lizhubo/workspace/alf/.my_venv/bin/python +# -*- coding: utf-8 -*- +import re +import sys + +from setuptools.command.easy_install import main + +if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) + sys.exit(main()) diff --git a/.my_venv/bin/pip b/.my_venv/bin/pip new file mode 100755 index 000000000..f51e24700 --- /dev/null +++ b/.my_venv/bin/pip @@ -0,0 +1,10 @@ +#!/Users/lizhubo/workspace/alf/.my_venv/bin/python +# -*- coding: utf-8 -*- +import re +import sys + +from pip._internal import main + +if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) + sys.exit(main()) diff --git a/.my_venv/bin/pip3 b/.my_venv/bin/pip3 new file mode 100755 index 000000000..f51e24700 --- /dev/null +++ b/.my_venv/bin/pip3 @@ -0,0 +1,10 @@ +#!/Users/lizhubo/workspace/alf/.my_venv/bin/python +# -*- coding: utf-8 -*- +import re +import sys + +from pip._internal import main + +if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) + sys.exit(main()) diff --git a/.my_venv/bin/pip3.7 b/.my_venv/bin/pip3.7 new file mode 100755 index 000000000..f51e24700 --- /dev/null +++ b/.my_venv/bin/pip3.7 @@ -0,0 +1,10 @@ +#!/Users/lizhubo/workspace/alf/.my_venv/bin/python +# -*- coding: utf-8 -*- +import re +import sys + +from pip._internal import main + +if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) + sys.exit(main()) diff --git a/.my_venv/bin/python b/.my_venv/bin/python new file mode 120000 index 000000000..3381f8782 --- /dev/null +++ b/.my_venv/bin/python @@ -0,0 +1 @@ +/opt/anaconda3/bin/python \ No newline at end of file diff --git a/.my_venv/bin/python3 b/.my_venv/bin/python3 new file mode 120000 index 000000000..d8654aa0e --- /dev/null +++ b/.my_venv/bin/python3 @@ -0,0 +1 @@ +python \ No newline at end of file diff --git a/.my_venv/pyvenv.cfg b/.my_venv/pyvenv.cfg new file mode 100644 index 000000000..caaaab722 --- /dev/null +++ b/.my_venv/pyvenv.cfg @@ -0,0 +1,3 @@ +home = /opt/anaconda3/bin +include-system-site-packages = false +version = 3.7.6 diff --git a/alf/.vscode/launch.json b/alf/.vscode/launch.json new file mode 100644 index 000000000..0024ede62 --- /dev/null +++ b/alf/.vscode/launch.json @@ -0,0 +1,22 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "train", + "type": "python", + "request": "launch", + "cwd": "/Users/lizhubo/workspace/alf/alf/examples", + "program": "/Users/lizhubo/workspace/alf/alf/bin/train.py", + "env": {"DISPLAY": ":8", + "CUDA_VISIBLE_DEVICES": "0"}, + "args": [ + "--root_dir=/Users/lizhubo/Desktop/Pytorch/Results50", + "--gin_file=/Users/lizhubo/workspace/alf/alf/examples/sarsa_sac_pendulum.gin", + ], + + } + ] +} diff --git a/alf/algorithms/actor_critic_loss.py b/alf/algorithms/actor_critic_loss.py index 1a28727b2..2092dfe51 100644 --- a/alf/algorithms/actor_critic_loss.py +++ b/alf/algorithms/actor_critic_loss.py @@ -169,7 +169,10 @@ def _calc_returns_and_advantages(self, experience, value): values=value, step_types=experience.step_type, discounts=experience.discount * self._gamma, - td_lambda=self._lambda) + target_value=value, + td_lambda=self._lambda, + importance_ratio=1.0, + use_retrace=False) advantages = tensor_utils.tensor_extend_zero(advantages) if self._use_td_lambda_return: returns = advantages + value diff --git a/alf/algorithms/ppo_algorithm.py b/alf/algorithms/ppo_algorithm.py index 72858d7b3..abdb59954 100644 --- a/alf/algorithms/ppo_algorithm.py +++ b/alf/algorithms/ppo_algorithm.py @@ -46,7 +46,10 @@ def preprocess_experience(self, exp: Experience): values=exp.rollout_info.value, step_types=exp.step_type, discounts=exp.discount * self._loss._gamma, + target_value=exp.rollout_info.value, td_lambda=self._loss._lambda, + importance_ratio=1.0, + use_retrace=False, time_major=False) advantages = torch.cat([ advantages, diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py index 44b6379ef..49b96cf06 100644 --- a/alf/algorithms/td_loss.py +++ b/alf/algorithms/td_loss.py @@ -99,7 +99,7 @@ def forward(self, experience, value, target_value, train_info): can be same as ``value``. train_info : train_info includes action distrbution, actor, critic and other information. Different algorithm may have different info inside. - For the retrace method, we can use SarsaInfo, SacInfo or DdpgInfo as train_info + For the retrace method, we can use SarsaInfo, SacInfo or DdpgInfo as train_info for Sac, Sarsa or Ddpg algorithm. Adding train_info to calculate importance_ratio and importance_ratio_clipped. Returns: @@ -118,10 +118,26 @@ def forward(self, experience, value, target_value, train_info): step_types=experience.step_type, discounts=experience.discount * self._gamma) elif self._use_retrace == False: + scope = alf.summary.scope(self.__class__.__name__) + importance_ratio, importance_ratio_clipped = value_ops. \ + action_importance_ratio( + action_distribution=train_info.action_distribution, + collect_action_distribution=experience.rollout_info. + action_distribution, + action=experience.action, + clipping_mode='capping', + importance_ratio_clipping=0.0, + log_prob_clipping=0.0, + scope=scope, + check_numerics=False, + debug_summaries=self._debug_summaries) advantages = value_ops.generalized_advantage_estimation( rewards=experience.reward, values=target_value, step_types=experience.step_type, + target_value=target_value, + importance_ratio=importance_ratio, + use_retrace=False, discounts=experience.discount * self._gamma, td_lambda=self._lambda) returns = advantages + target_value[:-1] @@ -139,15 +155,17 @@ def forward(self, experience, value, target_value, train_info): scope=scope, check_numerics=False, debug_summaries=self._debug_summaries) - advantages = value_ops.generalized_advantage_estimation_retrace( + advantages = value_ops.generalized_advantage_estimation( importance_ratio=importance_ratio_clipped, rewards=experience.reward, values=value, target_value=target_value, step_types=experience.step_type, discounts=experience.discount * self._gamma, + use_retrace=True, time_major=True, td_lambda=self._lambda) + returns = advantages + value[:-1] returns = returns.detach() value = value[:-1] diff --git a/alf/examples/carla.gin b/alf/examples/carla.gin index 39a1d1f66..e42a2c8b7 100644 --- a/alf/examples/carla.gin +++ b/alf/examples/carla.gin @@ -4,8 +4,8 @@ import alf import alf.algorithms.merlin_algorithm import alf.environments.suite_carla -CameraSensor.image_size_x=200 -CameraSensor.image_size_y=100 +CameraSensor.image_size_x=128 +CameraSensor.image_size_y=64 CameraSensor.fov=135 create_environment.env_name='Town01' diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py index 66e685027..583c14954 100644 --- a/alf/utils/value_ops.py +++ b/alf/utils/value_ops.py @@ -195,6 +195,9 @@ def generalized_advantage_estimation(rewards, values, step_types, discounts, + target_value, + importance_ratio, + use_retrace=False, td_lambda=1.0, time_major=True): """Computes generalized advantage estimation (GAE) for the first T-1 steps. @@ -231,6 +234,8 @@ def generalized_advantage_estimation(rewards, rewards = rewards.transpose(0, 1) values = values.transpose(0, 1) step_types = step_types.transpose(0, 1) + importance_ratio = importance_ratio.transpose(0, 1) + target_value = target_value.transpose(0, 1) assert values.shape[0] >= 2, ("The sequence length needs to be " "at least 2. Got {s}".format( @@ -240,16 +245,23 @@ def generalized_advantage_estimation(rewards, is_lasts = common.expand_dims_as(is_lasts, values) discounts = common.expand_dims_as(discounts, values) - weighted_discounts = discounts[1:] * td_lambda - advs = torch.zeros_like(values) - delta = rewards[1:] + discounts[1:] * values[1:] - values[:-1] - - with torch.no_grad(): - for t in reversed(range(rewards.shape[0] - 1)): - advs[t] = (1 - is_lasts[t]) * \ - (delta[t] + weighted_discounts[t] * advs[t + 1]) - advs = advs[:-1] + if use_retrace == False: + weighted_discounts = discounts[1:] * td_lambda + delta = rewards[1:] + discounts[1:] * values[1:] - values[:-1] + with torch.no_grad(): + for t in reversed(range(rewards.shape[0] - 1)): + advs[t] = (1 - is_lasts[t]) * \ + (delta[t] + weighted_discounts[t] * advs[t + 1]) + advs = advs[:-1] + else: + delta = (rewards[1:] + discounts[1:] * target_value[1:] - values[:-1]) + weighted_discounts = discounts[1:] * td_lambda * importance_ratio + with torch.no_grad(): + for t in reversed(range(rewards.shape[0] - 1)): + advs[t] = (1 - is_lasts[t]) * \ + (delta[t] + weighted_discounts[t] * advs[t + 1]) + advs = advs[:-1] if not time_major: advs = advs.transpose(0, 1) @@ -257,6 +269,7 @@ def generalized_advantage_estimation(rewards, return advs.detach() +''' # add for the retrace method def generalized_advantage_estimation_retrace(importance_ratio, discounts, rewards, td_lambda, time_major, @@ -304,3 +317,4 @@ def generalized_advantage_estimation_retrace(importance_ratio, discounts, advs = advs.transpose(0, 1) return advs.detach() +''' diff --git a/alf/utils/value_ops_test.py b/alf/utils/value_ops_test.py index 106f0d58e..cd5d19f39 100644 --- a/alf/utils/value_ops_test.py +++ b/alf/utils/value_ops_test.py @@ -96,14 +96,17 @@ class GeneralizedAdvantageTest(unittest.TestCase): """Tests for alf.utils.value_ops.generalized_advantage_estimation """ - def _check(self, rewards, values, step_types, discounts, td_lambda, - expected): + def _check(self, rewards, values, step_types, discounts, target_value, + importance_ratio, use_retrace, td_lambda, expected): np.testing.assert_array_almost_equal( value_ops.generalized_advantage_estimation( rewards=rewards, values=values, step_types=step_types, discounts=discounts, + target_value=target_value, + importance_ratio=importance_ratio, + use_retrace=use_retrace, td_lambda=td_lambda, time_major=False), expected) @@ -113,6 +116,9 @@ def _check(self, rewards, values, step_types, discounts, td_lambda, values=torch.stack([values, 2 * values], dim=2), step_types=step_types, discounts=discounts, + importance_ratio=importance_ratio, + target_value=target_value, + use_retrace=use_retrace, td_lambda=td_lambda, time_major=False), torch.stack([expected, 2 * expected], dim=2), @@ -124,7 +130,9 @@ def test_generalized_advantage_estimation(self): rewards = torch.tensor([[3.] * 5], dtype=torch.float32) discounts = torch.tensor([[0.9] * 5], dtype=torch.float32) td_lambda = 0.6 / 0.9 - + target_value = torch.tensor([[3.] * 4], dtype=torch.float32) + importance_ratio = torch.tensor([[0.8] * 3], dtype=torch.float32) + use_retrace = False d = 2 * 0.9 + 1 expected = torch.tensor([[((d * 0.6 + d) * 0.6 + d) * 0.6 + d, (d * 0.6 + d) * 0.6 + d, d * 0.6 + d, d]], @@ -134,7 +142,10 @@ def test_generalized_advantage_estimation(self): values=values, step_types=step_types, discounts=discounts, + importance_ratio=importance_ratio, + target_value=target_value, td_lambda=td_lambda, + use_retrace=use_retrace, expected=expected) # two episodes, and exceed by time limit (discount=1) @@ -150,7 +161,10 @@ def test_generalized_advantage_estimation(self): values=values, step_types=step_types, discounts=discounts, + importance_ratio=importance_ratio, + target_value=target_value, td_lambda=td_lambda, + use_retrace=use_retrace, expected=expected) # tow episodes, and end normal (discount=0) @@ -169,12 +183,16 @@ def test_generalized_advantage_estimation(self): step_types=step_types, discounts=discounts, td_lambda=td_lambda, + importance_ratio=importance_ratio, + target_value=target_value, + use_retrace=use_retrace, expected=expected) +''' class GeneralizedAdvantage_retrace_Test(unittest.TestCase): """Tests for alf.utils.value_ops - """ + """GeneralizedAdvantageTest.test_generalized_advantage_estimation() def test_generalized_advantage_estimation_retrace(self): values = torch.tensor([[2.] * 4], dtype=torch.float32) @@ -199,7 +217,7 @@ def test_generalized_advantage_estimation_retrace(self): td_lambda=td_lambda, importance_ratio=importance_ratio, time_major=False), expected) - +''' if __name__ == '__main__': unittest.main() diff --git a/setup.py b/setup.py index 579b90dbb..a8f3466ea 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ install_requires=[ 'atari_py == 0.1.7', 'cpplint', - 'clang-format == 9.0', + #'clang-format == 9.0', 'fasteners', 'gin-config@git+https://github.com/HorizonRobotics/gin-config.git', 'gym == 0.12.5', From 24661264ab83fbd725d4c35962a55c72418e3536 Mon Sep 17 00:00:00 2001 From: zhuboli <55901904+zhuboli@users.noreply.github.com> Date: Thu, 26 Nov 2020 08:05:28 +0800 Subject: [PATCH 9/9] merge function and fix bug --- .my_venv/bin/activate | 76 ----------------------------------- .my_venv/bin/activate.csh | 37 ----------------- .my_venv/bin/activate.fish | 75 ---------------------------------- .my_venv/bin/easy_install | 10 ----- .my_venv/bin/easy_install-3.7 | 10 ----- .my_venv/bin/pip | 10 ----- .my_venv/bin/pip3 | 10 ----- .my_venv/bin/pip3.7 | 10 ----- .my_venv/bin/python | 1 - .my_venv/bin/python3 | 1 - .my_venv/pyvenv.cfg | 3 -- alf/.vscode/launch.json | 22 ---------- 12 files changed, 265 deletions(-) delete mode 100644 .my_venv/bin/activate delete mode 100644 .my_venv/bin/activate.csh delete mode 100644 .my_venv/bin/activate.fish delete mode 100755 .my_venv/bin/easy_install delete mode 100755 .my_venv/bin/easy_install-3.7 delete mode 100755 .my_venv/bin/pip delete mode 100755 .my_venv/bin/pip3 delete mode 100755 .my_venv/bin/pip3.7 delete mode 120000 .my_venv/bin/python delete mode 120000 .my_venv/bin/python3 delete mode 100644 .my_venv/pyvenv.cfg delete mode 100644 alf/.vscode/launch.json diff --git a/.my_venv/bin/activate b/.my_venv/bin/activate deleted file mode 100644 index 1db2cc311..000000000 --- a/.my_venv/bin/activate +++ /dev/null @@ -1,76 +0,0 @@ -# This file must be used with "source bin/activate" *from bash* -# you cannot run it directly - -deactivate () { - # reset old environment variables - if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then - PATH="${_OLD_VIRTUAL_PATH:-}" - export PATH - unset _OLD_VIRTUAL_PATH - fi - if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then - PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}" - export PYTHONHOME - unset _OLD_VIRTUAL_PYTHONHOME - fi - - # This should detect bash and zsh, which have a hash command that must - # be called to get it to forget past commands. Without forgetting - # past commands the $PATH changes we made may not be respected - if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then - hash -r - fi - - if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then - PS1="${_OLD_VIRTUAL_PS1:-}" - export PS1 - unset _OLD_VIRTUAL_PS1 - fi - - unset VIRTUAL_ENV - if [ ! "${1:-}" = "nondestructive" ] ; then - # Self destruct! - unset -f deactivate - fi -} - -# unset irrelevant variables -deactivate nondestructive - -VIRTUAL_ENV="/Users/lizhubo/workspace/alf/.my_venv" -export VIRTUAL_ENV - -_OLD_VIRTUAL_PATH="$PATH" -PATH="$VIRTUAL_ENV/bin:$PATH" -export PATH - -# unset PYTHONHOME if set -# this will fail if PYTHONHOME is set to the empty string (which is bad anyway) -# could use `if (set -u; : $PYTHONHOME) ;` in bash -if [ -n "${PYTHONHOME:-}" ] ; then - _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}" - unset PYTHONHOME -fi - -if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then - _OLD_VIRTUAL_PS1="${PS1:-}" - if [ "x(.my_venv) " != x ] ; then - PS1="(.my_venv) ${PS1:-}" - else - if [ "`basename \"$VIRTUAL_ENV\"`" = "__" ] ; then - # special case for Aspen magic directories - # see http://www.zetadev.com/software/aspen/ - PS1="[`basename \`dirname \"$VIRTUAL_ENV\"\``] $PS1" - else - PS1="(`basename \"$VIRTUAL_ENV\"`)$PS1" - fi - fi - export PS1 -fi - -# This should detect bash and zsh, which have a hash command that must -# be called to get it to forget past commands. Without forgetting -# past commands the $PATH changes we made may not be respected -if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then - hash -r -fi diff --git a/.my_venv/bin/activate.csh b/.my_venv/bin/activate.csh deleted file mode 100644 index ec285bc16..000000000 --- a/.my_venv/bin/activate.csh +++ /dev/null @@ -1,37 +0,0 @@ -# This file must be used with "source bin/activate.csh" *from csh*. -# You cannot run it directly. -# Created by Davide Di Blasi . -# Ported to Python 3.3 venv by Andrew Svetlov - -alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; test "\!:*" != "nondestructive" && unalias deactivate' - -# Unset irrelevant variables. -deactivate nondestructive - -setenv VIRTUAL_ENV "/Users/lizhubo/workspace/alf/.my_venv" - -set _OLD_VIRTUAL_PATH="$PATH" -setenv PATH "$VIRTUAL_ENV/bin:$PATH" - - -set _OLD_VIRTUAL_PROMPT="$prompt" - -if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then - if (".my_venv" != "") then - set env_name = ".my_venv" - else - if (`basename "VIRTUAL_ENV"` == "__") then - # special case for Aspen magic directories - # see http://www.zetadev.com/software/aspen/ - set env_name = `basename \`dirname "$VIRTUAL_ENV"\`` - else - set env_name = `basename "$VIRTUAL_ENV"` - endif - endif - set prompt = "[$env_name] $prompt" - unset env_name -endif - -alias pydoc python -m pydoc - -rehash diff --git a/.my_venv/bin/activate.fish b/.my_venv/bin/activate.fish deleted file mode 100644 index 54a7ffcfb..000000000 --- a/.my_venv/bin/activate.fish +++ /dev/null @@ -1,75 +0,0 @@ -# This file must be used with ". bin/activate.fish" *from fish* (http://fishshell.org) -# you cannot run it directly - -function deactivate -d "Exit virtualenv and return to normal shell environment" - # reset old environment variables - if test -n "$_OLD_VIRTUAL_PATH" - set -gx PATH $_OLD_VIRTUAL_PATH - set -e _OLD_VIRTUAL_PATH - end - if test -n "$_OLD_VIRTUAL_PYTHONHOME" - set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME - set -e _OLD_VIRTUAL_PYTHONHOME - end - - if test -n "$_OLD_FISH_PROMPT_OVERRIDE" - functions -e fish_prompt - set -e _OLD_FISH_PROMPT_OVERRIDE - functions -c _old_fish_prompt fish_prompt - functions -e _old_fish_prompt - end - - set -e VIRTUAL_ENV - if test "$argv[1]" != "nondestructive" - # Self destruct! - functions -e deactivate - end -end - -# unset irrelevant variables -deactivate nondestructive - -set -gx VIRTUAL_ENV "/Users/lizhubo/workspace/alf/.my_venv" - -set -gx _OLD_VIRTUAL_PATH $PATH -set -gx PATH "$VIRTUAL_ENV/bin" $PATH - -# unset PYTHONHOME if set -if set -q PYTHONHOME - set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME - set -e PYTHONHOME -end - -if test -z "$VIRTUAL_ENV_DISABLE_PROMPT" - # fish uses a function instead of an env var to generate the prompt. - - # save the current fish_prompt function as the function _old_fish_prompt - functions -c fish_prompt _old_fish_prompt - - # with the original prompt function renamed, we can override with our own. - function fish_prompt - # Save the return status of the last command - set -l old_status $status - - # Prompt override? - if test -n "(.my_venv) " - printf "%s%s" "(.my_venv) " (set_color normal) - else - # ...Otherwise, prepend env - set -l _checkbase (basename "$VIRTUAL_ENV") - if test $_checkbase = "__" - # special case for Aspen magic directories - # see http://www.zetadev.com/software/aspen/ - printf "%s[%s]%s " (set_color -b blue white) (basename (dirname "$VIRTUAL_ENV")) (set_color normal) - else - printf "%s(%s)%s" (set_color -b blue white) (basename "$VIRTUAL_ENV") (set_color normal) - end - end - - # Restore the return status of the previous command. - echo "exit $old_status" | . - _old_fish_prompt - end - - set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV" -end diff --git a/.my_venv/bin/easy_install b/.my_venv/bin/easy_install deleted file mode 100755 index f630a2891..000000000 --- a/.my_venv/bin/easy_install +++ /dev/null @@ -1,10 +0,0 @@ -#!/Users/lizhubo/workspace/alf/.my_venv/bin/python -# -*- coding: utf-8 -*- -import re -import sys - -from setuptools.command.easy_install import main - -if __name__ == '__main__': - sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) diff --git a/.my_venv/bin/easy_install-3.7 b/.my_venv/bin/easy_install-3.7 deleted file mode 100755 index f630a2891..000000000 --- a/.my_venv/bin/easy_install-3.7 +++ /dev/null @@ -1,10 +0,0 @@ -#!/Users/lizhubo/workspace/alf/.my_venv/bin/python -# -*- coding: utf-8 -*- -import re -import sys - -from setuptools.command.easy_install import main - -if __name__ == '__main__': - sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) diff --git a/.my_venv/bin/pip b/.my_venv/bin/pip deleted file mode 100755 index f51e24700..000000000 --- a/.my_venv/bin/pip +++ /dev/null @@ -1,10 +0,0 @@ -#!/Users/lizhubo/workspace/alf/.my_venv/bin/python -# -*- coding: utf-8 -*- -import re -import sys - -from pip._internal import main - -if __name__ == '__main__': - sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) diff --git a/.my_venv/bin/pip3 b/.my_venv/bin/pip3 deleted file mode 100755 index f51e24700..000000000 --- a/.my_venv/bin/pip3 +++ /dev/null @@ -1,10 +0,0 @@ -#!/Users/lizhubo/workspace/alf/.my_venv/bin/python -# -*- coding: utf-8 -*- -import re -import sys - -from pip._internal import main - -if __name__ == '__main__': - sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) diff --git a/.my_venv/bin/pip3.7 b/.my_venv/bin/pip3.7 deleted file mode 100755 index f51e24700..000000000 --- a/.my_venv/bin/pip3.7 +++ /dev/null @@ -1,10 +0,0 @@ -#!/Users/lizhubo/workspace/alf/.my_venv/bin/python -# -*- coding: utf-8 -*- -import re -import sys - -from pip._internal import main - -if __name__ == '__main__': - sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) diff --git a/.my_venv/bin/python b/.my_venv/bin/python deleted file mode 120000 index 3381f8782..000000000 --- a/.my_venv/bin/python +++ /dev/null @@ -1 +0,0 @@ -/opt/anaconda3/bin/python \ No newline at end of file diff --git a/.my_venv/bin/python3 b/.my_venv/bin/python3 deleted file mode 120000 index d8654aa0e..000000000 --- a/.my_venv/bin/python3 +++ /dev/null @@ -1 +0,0 @@ -python \ No newline at end of file diff --git a/.my_venv/pyvenv.cfg b/.my_venv/pyvenv.cfg deleted file mode 100644 index caaaab722..000000000 --- a/.my_venv/pyvenv.cfg +++ /dev/null @@ -1,3 +0,0 @@ -home = /opt/anaconda3/bin -include-system-site-packages = false -version = 3.7.6 diff --git a/alf/.vscode/launch.json b/alf/.vscode/launch.json deleted file mode 100644 index 0024ede62..000000000 --- a/alf/.vscode/launch.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "train", - "type": "python", - "request": "launch", - "cwd": "/Users/lizhubo/workspace/alf/alf/examples", - "program": "/Users/lizhubo/workspace/alf/alf/bin/train.py", - "env": {"DISPLAY": ":8", - "CUDA_VISIBLE_DEVICES": "0"}, - "args": [ - "--root_dir=/Users/lizhubo/Desktop/Pytorch/Results50", - "--gin_file=/Users/lizhubo/workspace/alf/alf/examples/sarsa_sac_pendulum.gin", - ], - - } - ] -}