diff --git a/alf/algorithms/actor_critic_loss.py b/alf/algorithms/actor_critic_loss.py
index 1a28727b2..2092dfe51 100644
--- a/alf/algorithms/actor_critic_loss.py
+++ b/alf/algorithms/actor_critic_loss.py
@@ -169,7 +169,10 @@ def _calc_returns_and_advantages(self, experience, value):
             values=value,
             step_types=experience.step_type,
             discounts=experience.discount * self._gamma,
-            td_lambda=self._lambda)
+            target_value=value,
+            td_lambda=self._lambda,
+            importance_ratio=1.0,
+            use_retrace=False)
         advantages = tensor_utils.tensor_extend_zero(advantages)
         if self._use_td_lambda_return:
             returns = advantages + value
diff --git a/alf/algorithms/ddpg_algorithm.py b/alf/algorithms/ddpg_algorithm.py
index 2cad08fc7..ebc977ce4 100644
--- a/alf/algorithms/ddpg_algorithm.py
+++ b/alf/algorithms/ddpg_algorithm.py
@@ -326,7 +326,8 @@ def calc_loss(self, experience, train_info: DdpgInfo):
             critic_losses[i] = self._critic_losses[i](
                 experience=experience,
                 value=train_info.critic.q_values[:, :, i, ...],
-                target_value=train_info.critic.target_q_values).loss
+                target_value=train_info.critic.target_q_values,
+                train_info=train_info).loss

         critic_loss = math_ops.add_n(critic_losses)
diff --git a/alf/algorithms/ppo_algorithm.py b/alf/algorithms/ppo_algorithm.py
index 72858d7b3..abdb59954 100644
--- a/alf/algorithms/ppo_algorithm.py
+++ b/alf/algorithms/ppo_algorithm.py
@@ -46,7 +46,10 @@ def preprocess_experience(self, exp: Experience):
             values=exp.rollout_info.value,
             step_types=exp.step_type,
             discounts=exp.discount * self._loss._gamma,
+            target_value=exp.rollout_info.value,
             td_lambda=self._loss._lambda,
+            importance_ratio=1.0,
+            use_retrace=False,
             time_major=False)
         advantages = torch.cat([
             advantages,
diff --git a/alf/algorithms/sac_algorithm.py b/alf/algorithms/sac_algorithm.py
index da9ef80f6..e51ceef8c 100644
--- a/alf/algorithms/sac_algorithm.py
+++ b/alf/algorithms/sac_algorithm.py
@@ -757,7 +757,8 @@ def _calc_critic_loss(self, experience, train_info: SacInfo):
             critic_losses.append(
                 l(experience=experience,
                   value=critic_info.critics[:, :, i, ...],
-                  target_value=critic_info.target_critic).loss)
+                  target_value=critic_info.target_critic,
+                  train_info=train_info).loss)

         critic_loss = math_ops.add_n(critic_losses)
diff --git a/alf/algorithms/sarsa_algorithm.py b/alf/algorithms/sarsa_algorithm.py
index 86d07a74f..c451b52df 100644
--- a/alf/algorithms/sarsa_algorithm.py
+++ b/alf/algorithms/sarsa_algorithm.py
@@ -435,7 +435,7 @@ def calc_loss(self, experience, info: SarsaInfo):
             target_critic = tensor_utils.tensor_prepend_zero(
                 info.target_critics)
             loss_info = self._critic_losses[i](shifted_experience, critic,
-                                               target_critic)
+                                               target_critic, info)
             critic_losses.append(nest_map(lambda l: l[:-1], loss_info.loss))

         critic_loss = math_ops.add_n(critic_losses)
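The DDPG, SAC and SARSA edits above all follow one pattern: each per-critic loss is now called with the algorithm's own training info so that the loss can later form importance ratios, while the actor-critic and PPO paths simply pass the new GAE keywords with retrace disabled. A rough sketch of the new call shape (the names critic_loss_fn, q_values and target_q_values are placeholders, not exact fields of every algorithm):

    loss_info = critic_loss_fn(
        experience=experience,
        value=q_values,                # Q(x_t, a_t) from the critic being trained
        target_value=target_q_values,  # estimates from the target network
        train_info=train_info)         # carries the action distributions needed for Retrace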
diff --git a/alf/algorithms/td_loss.py b/alf/algorithms/td_loss.py
index bbf7f9d92..49b96cf06 100644
--- a/alf/algorithms/td_loss.py
+++ b/alf/algorithms/td_loss.py
@@ -31,6 +31,7 @@ def __init__(self,
                  td_error_loss_fn=element_wise_squared_loss,
                  td_lambda=0.95,
                  normalize_target=False,
+                 use_retrace=False,
                  debug_summaries=False,
                  name="TDLoss"):
         r"""Create a TDLoss object.
@@ -46,7 +47,8 @@ def __init__(self,
         :math:`G_t^\lambda = \hat{A}^{GAE}_t + V(s_t)`
         where the generalized advantage estimation is defined as:
         :math:`\hat{A}^{GAE}_t = \sum_{i=t}^{T-1}(\gamma\lambda)^{i-t}(R_{i+1} + \gamma V(s_{i+1}) - V(s_i))`
-
+        ``use_retrace=False`` gives the usual one-step or multi-step TD loss; ``use_retrace=True`` gives the Retrace loss
+        :math:`\mathcal{R} Q(x, a):=Q(x, a)+\mathbb{E}_{\mu}\left[\sum_{t \geq 0} \gamma^{t}\left(\prod_{s=1}^{t} c_{s}\right)\left(r_{t}+\gamma \mathbb{E}_{\pi} Q\left(x_{t+1}, \cdot\right)-Q\left(x_{t}, a_{t}\right)\right)\right]`

         References:
         Schulman et al. `High-Dimensional Continuous Control Using Generalized Advantage Estimation
@@ -55,6 +57,9 @@ def __init__(self,
         Sutton et al. `Reinforcement Learning: An Introduction
         <http://incompleteideas.net/book/the-book-2nd.html>`_, Chapter 12, 2018

+        Remi Munos et al. `Safe and efficient off-policy reinforcement learning
+        <https://arxiv.org/abs/1606.02647>`_
+
         Args:
             gamma (float): A discount factor for future rewards.
             td_errors_loss_fn (Callable): A function for computing the TD errors
@@ -76,8 +81,9 @@ def __init__(self,
         self._debug_summaries = debug_summaries
         self._normalize_target = normalize_target
         self._target_normalizer = None
+        self._use_retrace = use_retrace

-    def forward(self, experience, value, target_value):
+    def forward(self, experience, value, target_value, train_info):
         """Cacluate the loss.

         The first dimension of all the tensors is time dimension and the second
@@ -91,6 +97,11 @@ def forward(self, experience, value, target_value):
             target_value (torch.Tensor): the time-major tensor for the value at
                 each time step. This is used to calculate return. ``target_value``
                 can be same as ``value``.
+            train_info (namedtuple): algorithm-specific training information
+                (action distributions, actor and critic outputs); different
+                algorithms may carry different fields. For the Retrace loss,
+                ``SacInfo``, ``SarsaInfo`` or ``DdpgInfo`` can be passed for SAC,
+                SARSA or DDPG. It is used to compute the (clipped) importance ratio.
         Returns:
             LossInfo: with the ``extra`` field same as ``loss``.
         """
@@ -106,15 +117,57 @@ def forward(self, experience, value, target_value):
                 values=target_value,
                 step_types=experience.step_type,
                 discounts=experience.discount * self._gamma)
-        else:
+        elif not self._use_retrace:
+            scope = alf.summary.scope(self.__class__.__name__)
+            importance_ratio, importance_ratio_clipped = value_ops. \
+                action_importance_ratio(
+                    action_distribution=train_info.action_distribution,
+                    collect_action_distribution=experience.rollout_info.
+                    action_distribution,
+                    action=experience.action,
+                    clipping_mode='capping',
+                    importance_ratio_clipping=0.0,
+                    log_prob_clipping=0.0,
+                    scope=scope,
+                    check_numerics=False,
+                    debug_summaries=self._debug_summaries)
             advantages = value_ops.generalized_advantage_estimation(
                 rewards=experience.reward,
                 values=target_value,
                 step_types=experience.step_type,
+                target_value=target_value,
+                importance_ratio=importance_ratio,
+                use_retrace=False,
                 discounts=experience.discount * self._gamma,
                 td_lambda=self._lambda)
             returns = advantages + target_value[:-1]
+        else:
+            scope = alf.summary.scope(self.__class__.__name__)
+            importance_ratio, importance_ratio_clipped = value_ops. \
+                action_importance_ratio(
+                    action_distribution=train_info.action_distribution,
+                    collect_action_distribution=experience.rollout_info.
+                    action_distribution,
+                    action=experience.action,
+                    clipping_mode='capping',
+                    importance_ratio_clipping=0.0,
+                    log_prob_clipping=0.0,
+                    scope=scope,
+                    check_numerics=False,
+                    debug_summaries=self._debug_summaries)
+            advantages = value_ops.generalized_advantage_estimation(
+                importance_ratio=importance_ratio_clipped,
+                rewards=experience.reward,
+                values=value,
+                target_value=target_value,
+                step_types=experience.step_type,
+                discounts=experience.discount * self._gamma,
+                use_retrace=True,
+                time_major=True,
+                td_lambda=self._lambda)
+            returns = advantages + value[:-1]
+
         returns = returns.detach()
         value = value[:-1]
         if self._normalize_target:
             if self._target_normalizer is None:
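Both new branches above share the same backward recursion; only the TD error target and the per-step weight differ. A minimal standalone sketch of the Retrace recursion (my own simplification: scalar discount, no episode-boundary masking via step_types; c stands for the clipped importance ratios returned by value_ops.action_importance_ratio):

    import torch

    def retrace_advantages(rewards, q, q_target, c, gamma, td_lambda):
        # delta_t = r_{t+1} + gamma * Q_target(x_{t+1}) - Q(x_t, a_t)
        delta = rewards[1:] + gamma * q_target[1:] - q[:-1]
        advs = torch.zeros_like(q)
        # backward pass: A_t = delta_t + gamma * lambda * c_t * A_{t+1}
        for t in reversed(range(rewards.shape[0] - 1)):
            advs[t] = delta[t] + gamma * td_lambda * c[t] * advs[t + 1]
        return advs[:-1]

With c_t == 1 everywhere and q_target equal to q, this is exactly the GAE recursion used in the non-retrace branch.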
diff --git a/alf/examples/carla.gin b/alf/examples/carla.gin
index 39a1d1f66..e42a2c8b7 100644
--- a/alf/examples/carla.gin
+++ b/alf/examples/carla.gin
@@ -4,8 +4,8 @@ import alf
 import alf.algorithms.merlin_algorithm
 import alf.environments.suite_carla

-CameraSensor.image_size_x=200
-CameraSensor.image_size_y=100
+CameraSensor.image_size_x=128
+CameraSensor.image_size_y=64
 CameraSensor.fov=135

 create_environment.env_name='Town01'
diff --git a/alf/utils/value_ops.py b/alf/utils/value_ops.py
index a6bf85a23..583c14954 100644
--- a/alf/utils/value_ops.py
+++ b/alf/utils/value_ops.py
@@ -195,6 +195,9 @@ def generalized_advantage_estimation(rewards,
                                      values,
                                      step_types,
                                      discounts,
+                                     target_value,
+                                     importance_ratio,
+                                     use_retrace=False,
                                      td_lambda=1.0,
                                      time_major=True):
     """Computes generalized advantage estimation (GAE) for the first T-1 steps.
@@ -231,6 +234,8 @@ def generalized_advantage_estimation(rewards,
         rewards = rewards.transpose(0, 1)
         values = values.transpose(0, 1)
         step_types = step_types.transpose(0, 1)
+        # importance_ratio may be a python scalar (e.g. 1.0) when use_retrace is False.
+        if isinstance(importance_ratio, torch.Tensor):
+            importance_ratio = importance_ratio.transpose(0, 1)
+        target_value = target_value.transpose(0, 1)

     assert values.shape[0] >= 2, ("The sequence length needs to be "
                                   "at least 2. Got {s}".format(
@@ -240,18 +245,76 @@ def generalized_advantage_estimation(rewards,
     is_lasts = common.expand_dims_as(is_lasts, values)
     discounts = common.expand_dims_as(discounts, values)

-    weighted_discounts = discounts[1:] * td_lambda
+    advs = torch.zeros_like(values)
+    if not use_retrace:
+        weighted_discounts = discounts[1:] * td_lambda
+        delta = rewards[1:] + discounts[1:] * values[1:] - values[:-1]
+        with torch.no_grad():
+            for t in reversed(range(rewards.shape[0] - 1)):
+                advs[t] = (1 - is_lasts[t]) * \
+                    (delta[t] + weighted_discounts[t] * advs[t + 1])
+            advs = advs[:-1]
+    else:
+        delta = (rewards[1:] + discounts[1:] * target_value[1:] - values[:-1])
+        weighted_discounts = discounts[1:] * td_lambda * importance_ratio
+        with torch.no_grad():
+            for t in reversed(range(rewards.shape[0] - 1)):
+                advs[t] = (1 - is_lasts[t]) * \
+                    (delta[t] + weighted_discounts[t] * advs[t + 1])
+            advs = advs[:-1]
+
+    if not time_major:
+        advs = advs.transpose(0, 1)
+    return advs.detach()
+
+
+'''
+# Reference implementation for the Retrace method (kept for comparison).
+def generalized_advantage_estimation_retrace(importance_ratio, discounts,
+                                             rewards, td_lambda, time_major,
+                                             values, target_value, step_types):
+    """Compute the generalized advantage estimation for the Retrace method.
+    The main change from ``generalized_advantage_estimation`` is the
+    importance-ratio weighting of the discounts.
+
+    Args:
+        importance_ratio (Tensor): clipped importance ratios with values in
+            [0, 1]; the shape should broadcast with ``discounts[1:]``
+            (e.g. [T-1, B]).
+        rewards (Tensor): shape is [T, B] (or [T]) representing rewards.
+        values (Tensor): shape is [T, B] (or [T]) representing values.
+        target_value (Tensor): shape is [T, B] (or [T]) representing the
+            target network's value estimates.
+        step_types (Tensor): shape is [T, B] (or [T]) representing step types.
+        discounts (Tensor): shape is [T, B] (or [T]) representing discounts.
+        td_lambda (float): A scalar between [0, 1]. It's used for variance
+            reduction in temporal difference.
+        time_major (bool): Whether input tensors are time major.
+            False means input tensors have shape [B, T].

+    Returns:
+        A tensor with shape [T-1, B] representing advantages. Shape is [B, T-1]
+        when time_major is false.
+    """
+    if not time_major:
+        discounts = discounts.transpose(0, 1)
+        rewards = rewards.transpose(0, 1)
+        values = values.transpose(0, 1)
+        step_types = step_types.transpose(0, 1)
+        importance_ratio = importance_ratio.transpose(0, 1)
+        target_value = target_value.transpose(0, 1)
+
+    assert values.shape[0] >= 2, ("The sequence length needs to be "
+                                  "at least 2. Got {s}".format(
+                                      s=values.shape[0]))

     advs = torch.zeros_like(values)
-    delta = rewards[1:] + discounts[1:] * values[1:] - values[:-1]
+    is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32)
+    delta = (rewards[1:] + discounts[1:] * target_value[1:] - values[:-1])
+    weighted_discounts = discounts[1:] * td_lambda * importance_ratio

     with torch.no_grad():
         for t in reversed(range(rewards.shape[0] - 1)):
             advs[t] = (1 - is_lasts[t]) * \
                       (delta[t] + weighted_discounts[t] * advs[t + 1])
         advs = advs[:-1]
-
     if not time_major:
         advs = advs.transpose(0, 1)

     return advs.detach()
+'''
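A hypothetical end-to-end call of the extended value_ops.generalized_advantage_estimation with the Retrace path enabled (shapes, values and the alf import paths are assumptions for illustration, not taken from the patch):

    import torch
    from alf.data_structures import StepType
    from alf.utils import value_ops

    T, B = 5, 3
    rewards = torch.rand(T, B)
    q = torch.rand(T, B)           # Q(x_t, a_t) from the critic being trained
    q_target = torch.rand(T, B)    # Q estimates from the target critic
    step_types = torch.tensor([[StepType.MID] * B] * T, dtype=torch.int64)
    discounts = torch.full((T, B), 0.99)
    c = torch.rand(T - 1, B)       # clipped importance ratios in [0, 1]

    adv = value_ops.generalized_advantage_estimation(
        rewards=rewards,
        values=q,
        step_types=step_types,
        discounts=discounts,
        target_value=q_target,
        importance_ratio=c,
        use_retrace=True,
        td_lambda=0.95,
        time_major=True)
    assert adv.shape == (T - 1, B)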
diff --git a/alf/utils/value_ops_test.py b/alf/utils/value_ops_test.py
index ebd526127..cd5d19f39 100644
--- a/alf/utils/value_ops_test.py
+++ b/alf/utils/value_ops_test.py
@@ -96,14 +96,17 @@ class GeneralizedAdvantageTest(unittest.TestCase):
     """Tests for alf.utils.value_ops.generalized_advantage_estimation
     """

-    def _check(self, rewards, values, step_types, discounts, td_lambda,
-               expected):
+    def _check(self, rewards, values, step_types, discounts, target_value,
+               importance_ratio, use_retrace, td_lambda, expected):
         np.testing.assert_array_almost_equal(
             value_ops.generalized_advantage_estimation(
                 rewards=rewards,
                 values=values,
                 step_types=step_types,
                 discounts=discounts,
+                target_value=target_value,
+                importance_ratio=importance_ratio,
+                use_retrace=use_retrace,
                 td_lambda=td_lambda,
                 time_major=False),
             expected)
@@ -113,6 +116,9 @@ def _check(self, rewards, values, step_types, discounts, td_lambda,
             values=torch.stack([values, 2 * values], dim=2),
             step_types=step_types,
             discounts=discounts,
+            importance_ratio=importance_ratio,
+            target_value=target_value,
+            use_retrace=use_retrace,
             td_lambda=td_lambda,
             time_major=False),
         torch.stack([expected, 2 * expected], dim=2),
@@ -124,7 +130,9 @@ def test_generalized_advantage_estimation(self):
         rewards = torch.tensor([[3.] * 5], dtype=torch.float32)
         discounts = torch.tensor([[0.9] * 5], dtype=torch.float32)
         td_lambda = 0.6 / 0.9
-
+        target_value = torch.tensor([[3.] * 4], dtype=torch.float32)
+        importance_ratio = torch.tensor([[0.8] * 3], dtype=torch.float32)
+        use_retrace = False
         d = 2 * 0.9 + 1
         expected = torch.tensor([[((d * 0.6 + d) * 0.6 + d) * 0.6 + d,
                                   (d * 0.6 + d) * 0.6 + d, d * 0.6 + d, d]],
@@ -134,7 +142,10 @@
             values=values,
             step_types=step_types,
             discounts=discounts,
+            importance_ratio=importance_ratio,
+            target_value=target_value,
             td_lambda=td_lambda,
+            use_retrace=use_retrace,
             expected=expected)

         # two episodes, and exceed by time limit (discount=1)
@@ -150,7 +161,10 @@
             values=values,
             step_types=step_types,
             discounts=discounts,
+            importance_ratio=importance_ratio,
+            target_value=target_value,
             td_lambda=td_lambda,
+            use_retrace=use_retrace,
             expected=expected)

         # tow episodes, and end normal (discount=0)
@@ -169,8 +183,41 @@
             step_types=step_types,
             discounts=discounts,
             td_lambda=td_lambda,
+            importance_ratio=importance_ratio,
+            target_value=target_value,
+            use_retrace=use_retrace,
             expected=expected)

+
+'''
+class GeneralizedAdvantage_retrace_Test(unittest.TestCase):
+    """Tests for alf.utils.value_ops.generalized_advantage_estimation_retrace
+    """
+
+    def test_generalized_advantage_estimation_retrace(self):
+        values = torch.tensor([[2.] * 4], dtype=torch.float32)
+        step_types = torch.tensor([[StepType.MID] * 4], dtype=torch.int64)
+        rewards = torch.tensor([[3.] * 4], dtype=torch.float32)
+        discounts = torch.tensor([[0.9] * 4], dtype=torch.float32)
+        td_lambda = 0.6 / 0.9
+        target_value = torch.tensor([[3.] * 4], dtype=torch.float32)
+        importance_ratio = torch.tensor([[0.8] * 3], dtype=torch.float32)
+        d = 3 * 0.9 + 3 - 2
+        expected = torch.tensor(
+            [[(d * 0.6 * 0.8) * 0.6 * 0.8 + 0.6 * 0.8 * d + d,
+              d * 0.6 * 0.8 + d, d]],
+            dtype=torch.float32)
+        np.testing.assert_array_almost_equal(
+            value_ops.generalized_advantage_estimation_retrace(
+                rewards=rewards,
+                values=values,
+                target_value=target_value,
+                step_types=step_types,
+                discounts=discounts,
+                td_lambda=td_lambda,
+                importance_ratio=importance_ratio,
+                time_major=False), expected)
+'''
+

 if __name__ == '__main__':
     unittest.main()
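A quick hand check of the expected values in the commented-out Retrace test above (my own arithmetic, not part of the patch): every TD error is d = 3 + 0.9 * 3 - 2 = 3.7 and the per-step weight is gamma * lambda * c = 0.9 * (0.6 / 0.9) * 0.8 = 0.48, so the backward recursion reproduces the expected tensor:

    import torch

    d, w = 3.7, 0.48
    adv = torch.zeros(4)
    for t in reversed(range(3)):
        adv[t] = d + w * adv[t + 1]
    print(adv[:-1])  # tensor([6.3285, 5.4760, 3.7000]) == [d*w*w + d*w + d, d*w + d, d]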
diff --git a/setup.py b/setup.py
index 579b90dbb..a8f3466ea 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
     install_requires=[
        'atari_py == 0.1.7',
        'cpplint',
-        'clang-format == 9.0',
+        #'clang-format == 9.0',
        'fasteners',
        'gin-config@git+https://github.com/HorizonRobotics/gin-config.git',
        'gym == 0.12.5',